ngohuudang committed
Commit
1b76ad1
1 Parent(s): 9bb5ff5

update file

.gitattributes CHANGED
@@ -2,34 +2,26 @@
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
__pycache__/gec_model.cpython-310.pyc ADDED
Binary file (14.1 kB).

__pycache__/gec_model.cpython-311.pyc ADDED
Binary file (25.9 kB).

__pycache__/gec_model.cpython-39.pyc ADDED
Binary file (14.2 kB).

__pycache__/modeling_seq2labels.cpython-310.pyc ADDED
Binary file (3.97 kB).

__pycache__/modeling_seq2labels.cpython-311.pyc ADDED
Binary file (7.03 kB).

__pycache__/modeling_seq2labels.cpython-39.pyc ADDED
Binary file (4.06 kB).

__pycache__/utils.cpython-310.pyc ADDED
Binary file (6.13 kB).

__pycache__/utils_gec.cpython-310.pyc ADDED
Binary file (6.14 kB).

__pycache__/utils_gec.cpython-311.pyc ADDED
Binary file (11.8 kB).

__pycache__/utils_gec.cpython-39.pyc ADDED
Binary file (6.12 kB).

__pycache__/vocabulary.cpython-310.pyc ADDED
Binary file (12.9 kB).

__pycache__/vocabulary.cpython-311.pyc ADDED
Binary file (18.9 kB).

__pycache__/vocabulary.cpython-39.pyc ADDED
Binary file (13 kB).
 
config.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "architectures": [
3
+ "Seq2LabelsModel"
4
+ ],
5
+ "initializer_range": 0.02,
6
+ "label_smoothing": 0.0,
7
+ "load_pretrained": false,
8
+ "model_type": "bert",
9
+ "num_detect_classes": 4,
10
+ "pad_token_id": 0,
11
+ "predictor_dropout": 0.0,
12
+ "pretrained_name_or_path": "xlm-roberta-capu/xlm-roberta-base",
13
+ "special_tokens_fix": true,
14
+ "torch_dtype": "float32",
15
+ "transformers_version": "4.18.0",
16
+ "use_cache": true,
17
+ "vocab_size": 15
18
+ }
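
The values above tie the repo together: `vocab_size` (15) is the number of edit labels in vocabulary/labels.txt, `num_detect_classes` (4) matches vocabulary/d_tags.txt, and `pretrained_name_or_path` points at the XLM-RoBERTa backbone shipped with the repo. A minimal sketch of reading this file through the custom config class added in this commit, assuming it is run from a local clone:

```python
# Minimal sketch, assuming the current working directory is a local clone of this repo.
from configuration_seq2labels import Seq2LabelsConfig

config = Seq2LabelsConfig.from_json_file("config.json")
print(config.vocab_size)               # 15 -> one output unit per label in vocabulary/labels.txt
print(config.num_detect_classes)       # 4  -> CORRECT / INCORRECT / @@UNKNOWN@@ / @@PADDING@@
print(config.pretrained_name_or_path)  # "xlm-roberta-capu/xlm-roberta-base"
```
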
configuration_seq2labels.py ADDED
@@ -0,0 +1,62 @@
1
+ from transformers import PretrainedConfig
2
+
3
+
4
+ class Seq2LabelsConfig(PretrainedConfig):
5
+ r"""
6
+ This is the configuration class to store the configuration of a [`Seq2LabelsModel`]. It is used to
7
+ instantiate a Seq2Labels model according to the specified arguments, defining the model architecture. Instantiating a
8
+ configuration with the defaults will yield a similar configuration to that of the Seq2Labels architecture.
9
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
10
+ documentation from [`PretrainedConfig`] for more information.
11
+ Args:
12
+ vocab_size (`int`, *optional*, defaults to 15):
13
+ Size of the label vocabulary of the Seq2Labels head, i.e. the number of different correction labels
14
+ that can be predicted for each token (see `vocabulary/labels.txt`).
15
+ pretrained_name_or_path (`str`, *optional*, defaults to `bert-base-cased`):
16
+ Pretrained BERT-like model path
17
+ load_pretrained (`bool`, *optional*, defaults to `False`):
18
+ Whether to load pretrained model from `pretrained_name_or_path`
19
+ use_cache (`bool`, *optional*, defaults to `True`):
20
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
21
+ relevant if `config.is_decoder=True`.
22
+ predictor_dropout (`float`, *optional*):
23
+ The dropout ratio for the classification head.
24
+ special_tokens_fix (`bool`, *optional*, defaults to `False`):
25
+ Whether to add additional tokens to the BERT's embedding layer.
26
+ Examples:
27
+ ```python
28
+ >>> from configuration_seq2labels import Seq2LabelsConfig
+ >>> from modeling_seq2labels import Seq2LabelsModel
29
+ >>> # Initializing a Seq2Labels style configuration
30
+ >>> configuration = Seq2LabelsConfig()
31
+ >>> # Initializing a model from that configuration
32
+ >>> model = Seq2LabelsModel(configuration)
33
+ >>> # Accessing the model configuration
34
+ >>> configuration = model.config
35
+ ```"""
36
+ model_type = "bert"
37
+
38
+ def __init__(
39
+ self,
40
+ pretrained_name_or_path="bert-base-cased",
41
+ vocab_size=15,
42
+ num_detect_classes=4,
43
+ load_pretrained=False,
44
+ initializer_range=0.02,
45
+ pad_token_id=0,
46
+ use_cache=True,
47
+ predictor_dropout=0.0,
48
+ special_tokens_fix=False,
49
+ label_smoothing=0.0,
50
+ **kwargs
51
+ ):
52
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
53
+
54
+ self.vocab_size = vocab_size
55
+ self.num_detect_classes = num_detect_classes
56
+ self.pretrained_name_or_path = pretrained_name_or_path
57
+ self.load_pretrained = load_pretrained
58
+ self.initializer_range = initializer_range
59
+ self.use_cache = use_cache
60
+ self.predictor_dropout = predictor_dropout
61
+ self.special_tokens_fix = special_tokens_fix
62
+ self.label_smoothing = label_smoothing
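
A short, hedged sketch of building the same configuration programmatically instead of reading config.json; the keyword values simply mirror the JSON above, and `save_pretrained` is the standard `PretrainedConfig` serialization method:

```python
# Sketch only: recreate the committed config.json from Seq2LabelsConfig and write it back out.
from configuration_seq2labels import Seq2LabelsConfig

cfg = Seq2LabelsConfig(
    pretrained_name_or_path="xlm-roberta-capu/xlm-roberta-base",  # backbone path used by this repo
    vocab_size=15,            # number of correction labels (vocabulary/labels.txt)
    num_detect_classes=4,     # number of detection tags (vocabulary/d_tags.txt)
    load_pretrained=False,    # build the backbone from its config only; weights come from pytorch_model.bin
    special_tokens_fix=True,  # reserve an extra embedding row for the $START token
)
cfg.save_pretrained("exported_config")  # writes exported_config/config.json
```
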
gec_model.py ADDED
@@ -0,0 +1,449 @@
1
+ """Wrapper of Seq2Labels model. Fixes errors based on model predictions"""
2
+ from collections import defaultdict
3
+ from difflib import SequenceMatcher
4
+ import logging
5
+ import re
6
+ from time import time
7
+ from typing import List, Union
8
+ import warnings
9
+ import sys
10
+ import torch
11
+ from transformers import AutoTokenizer
12
+ from modeling_seq2labels import Seq2LabelsModel
13
+ from vocabulary import Vocabulary
14
+ from utils_gec import PAD, UNK, START_TOKEN, get_target_sent_by_edits
15
+ current_dir = sys.path[0].replace('\\','/')
16
+ logging.getLogger("werkzeug").setLevel(logging.ERROR)
17
+ logger = logging.getLogger(__file__)
18
+
19
+
20
+ class GecBERTModel(torch.nn.Module):
21
+ def __init__(
22
+ self,
23
+ vocab_path=None,
24
+ model_paths=None,
25
+ weights=None,
26
+ device=None,
27
+ max_len=64,
28
+ min_len=3,
29
+ lowercase_tokens=False,
30
+ log=False,
31
+ iterations=3,
32
+ min_error_probability=0.0,
33
+ confidence=0,
34
+ resolve_cycles=False,
35
+ split_chunk=False,
36
+ chunk_size=48,
37
+ overlap_size=12,
38
+ min_words_cut=6,
39
+ punc_dict={':', ".", ",", "?"},
40
+ ):
41
+ r"""
42
+ Args:
43
+ vocab_path (`str`):
44
+ Path to vocabulary directory.
45
+ model_paths (`List[str]`):
46
+ List of model paths.
47
+ weights (`List[float]`, *optional*, defaults to None):
48
+ Ensembling weight of each model. Only relevant when more than one model path is given.
49
+ device (`str` or `torch.device`, *optional*, defaults to None):
50
+ Device to load the models on. If not set, the device is chosen automatically (CUDA if available).
51
+ max_len (`int`, defaults to 64):
52
+ Max sentence length to be processed (all longer will be truncated).
53
+ min_len (`int`, defaults to 3):
54
+ Min sentence length to be processed (all shorter will be returned without changes).
55
+ lowercase_tokens (`bool`, defaults to False):
56
+ Whether to lowercase tokens.
57
+ log (`bool`, defaults to False):
58
+ Whether to enable logging.
59
+ iterations (`int`, defaults to 3):
60
+ Max iterations to run during inference.
61
+ special_tokens_fix (`bool`, defaults to True):
62
+ Whether to fix the tokenization problem with the [CLS]/[SEP] special tokens.
63
+ min_error_probability (`float`, defaults to `0.0`):
64
+ Minimum probability for each action to apply.
65
+ confidence (`float`, defaults to `0.0`):
66
+ How much probability to add to the $KEEP token.
67
+ split_chunk (`bool`, defaults to False):
68
+ Whether to split long sentences to multiple segments of `chunk_size`.
69
+ Warning: if `chunk_size > max_len`, each segment will be truncated to `max_len`.
70
+ chunk_size (`int`, defaults to 48):
71
+ Length of each segment (in words). Only relevant if `split_chunk is True`.
72
+ overlap_size (`int`, defaults to 12):
73
+ Overlap size (in words) between two consecutive segments. Only relevant if `split_chunk is True`.
74
+ min_words_cut (`int`, defaults to 6):
75
+ Minimum number of words to be cut while merging two consecutive segments.
76
+ Only relevant if `split_chunk is True`.
77
+ punc_dict (`Set[str]`, defaults to `{':', ".", ",", "?"}`):
78
+ Set of punctuation marks used when merging chunks and re-attaching punctuation to words.
79
+ """
80
+ super().__init__()
81
+ if isinstance(model_paths, str):
82
+ model_paths = [model_paths]
83
+ self.model_weights = list(map(float, weights)) if weights else [1] * len(model_paths)
84
+ self.device = (
85
+ torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else torch.device(device)
86
+ )
87
+ # self.device = torch.device("cpu")
88
+ self.max_len = max_len
89
+ self.min_len = min_len
90
+ self.lowercase_tokens = lowercase_tokens
91
+ self.min_error_probability = min_error_probability
92
+ self.vocab = Vocabulary.from_files(vocab_path)
93
+ self.incorr_index = self.vocab.get_token_index("INCORRECT", "d_tags")
94
+ self.log = log
95
+ self.iterations = iterations
96
+ self.confidence = confidence
97
+ self.resolve_cycles = resolve_cycles
98
+
99
+ assert (
100
+ chunk_size > 0 and chunk_size // 2 >= overlap_size
101
+ ), "Chunk merging required overlap size must be smaller than half of chunk size"
102
+ self.split_chunk = split_chunk
103
+ self.chunk_size = chunk_size
104
+ self.overlap_size = overlap_size
105
+ self.min_words_cut = min_words_cut
106
+ self.stride = chunk_size - overlap_size
107
+ self.punc_dict = punc_dict
108
+ self.punc_str = '[' + ''.join(re.escape(x) for x in punc_dict) + ']'  # regex character class of punctuation marks
109
+ # set training parameters and operations
110
+
111
+ self.indexers = []
112
+ self.models = []
113
+ for model_path in model_paths:
114
+ model = Seq2LabelsModel.from_pretrained(model_path)
115
+ config = model.config
116
+ model_name = current_dir + "/" + config.pretrained_name_or_path
117
+ special_tokens_fix = config.special_tokens_fix
118
+ self.indexers.append(self._get_indexer(model_name, special_tokens_fix))
119
+ model.eval().to(self.device)
120
+ self.models.append(model)
121
+
122
+ def _get_indexer(self, weights_name, special_tokens_fix):
123
+ tokenizer = AutoTokenizer.from_pretrained(
124
+ weights_name, do_basic_tokenize=False,
125
+ do_lower_case=self.lowercase_tokens, model_max_length=1024
126
+ )
127
+ # to adjust all tokenizers
128
+ if hasattr(tokenizer, 'encoder'):
129
+ tokenizer.vocab = tokenizer.encoder
130
+ if hasattr(tokenizer, 'sp_model'):
131
+ tokenizer.vocab = defaultdict(lambda: 1)
132
+ for i in range(tokenizer.sp_model.get_piece_size()):
133
+ tokenizer.vocab[tokenizer.sp_model.id_to_piece(i)] = i
134
+
135
+ if special_tokens_fix:
136
+ tokenizer.add_tokens([START_TOKEN])
137
+ tokenizer.vocab[START_TOKEN] = len(tokenizer) - 1
138
+ return tokenizer
139
+
140
+ def forward(self, text: Union[str, List[str], List[List[str]]], is_split_into_words=False):
141
+ # Input type checking for clearer error
142
+ def _is_valid_text_input(t):
143
+ if isinstance(t, str):
144
+ # Strings are fine
145
+ return True
146
+ elif isinstance(t, (list, tuple)):
147
+ # Lists are fine as long as they are...
148
+ if len(t) == 0:
149
+ # ... empty
150
+ return True
151
+ elif isinstance(t[0], str):
152
+ # ... list of strings
153
+ return True
154
+ elif isinstance(t[0], (list, tuple)):
155
+ # ... list with an empty list or with a list of strings
156
+ return len(t[0]) == 0 or isinstance(t[0][0], str)
157
+ else:
158
+ return False
159
+ else:
160
+ return False
161
+
162
+ if not _is_valid_text_input(text):
163
+ raise ValueError(
164
+ "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
165
+ "or `List[List[str]]` (batch of pretokenized examples)."
166
+ )
167
+
168
+ if is_split_into_words:
169
+ is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
170
+ else:
171
+ is_batched = isinstance(text, (list, tuple))
172
+ if is_batched:
173
+ text = [x.split() for x in text]
174
+ else:
175
+ text = text.split()
176
+
177
+ if not is_batched:
178
+ text = [text]
179
+
180
+ return self.handle_batch(text)
181
+
182
+ def split_chunks(self, batch):
183
+ # return batch pairs of indices
184
+ result = []
185
+ indices = []
186
+ for tokens in batch:
187
+ start = len(result)
188
+ num_token = len(tokens)
189
+ if num_token <= self.chunk_size:
190
+ result.append(tokens)
191
+ elif num_token > self.chunk_size and num_token < (self.chunk_size * 2 - self.overlap_size):
192
+ split_idx = (num_token + self.overlap_size + 1) // 2
193
+ result.append(tokens[:split_idx])
194
+ result.append(tokens[split_idx - self.overlap_size :])
195
+ else:
196
+ for i in range(0, num_token - self.overlap_size, self.stride):
197
+ result.append(tokens[i : i + self.chunk_size])
198
+
199
+ indices.append((start, len(result)))
200
+
201
+ return result, indices
202
+
203
+ def check_alnum(self, s):
204
+ if len(s) < 2:
205
+ return False
206
+ return not (s.isalpha() or s.isdigit())
207
+
208
+ def apply_chunk_merging(self, tokens, next_tokens):
209
+ # Return next tokens if current tokens list is empty
210
+ if not tokens:
211
+ return next_tokens
212
+
213
+ source_token_idx = []
214
+ target_token_idx = []
215
+ source_tokens = []
216
+ target_tokens = []
217
+ num_keep = self.overlap_size - self.min_words_cut
218
+ i = 0
219
+ while len(source_token_idx) < self.overlap_size and -i < len(tokens):
220
+ i -= 1
221
+ if tokens[i] not in self.punc_dict:
222
+ source_token_idx.insert(0, i)
223
+ source_tokens.insert(0, tokens[i].lower())
224
+
225
+ i = 0
226
+ while len(target_token_idx) < self.overlap_size and i < len(next_tokens):
227
+ if next_tokens[i] not in self.punc_dict:
228
+ target_token_idx.append(i)
229
+ target_tokens.append(next_tokens[i].lower())
230
+ i += 1
231
+
232
+ matcher = SequenceMatcher(None, source_tokens, target_tokens)
233
+ diffs = list(matcher.get_opcodes())
234
+
235
+ for diff in diffs:
236
+ tag, i1, i2, j1, j2 = diff
237
+ if tag == "equal":
238
+ if i1 >= num_keep:
239
+ tail_idx = source_token_idx[i1]
240
+ head_idx = target_token_idx[j1]
241
+ break
242
+ elif i2 > num_keep:
243
+ tail_idx = source_token_idx[num_keep]
244
+ head_idx = target_token_idx[j2 - i2 + num_keep]
245
+ break
246
+ elif tag == "delete" and i1 == 0:
247
+ num_keep += i2 // 2
248
+
249
+ tokens = tokens[:tail_idx] + next_tokens[head_idx:]
250
+ return tokens
251
+
252
+ def merge_chunks(self, batch):
253
+ result = []
254
+ if len(batch) == 1 or self.overlap_size == 0:
255
+ for sub_tokens in batch:
256
+ result.extend(sub_tokens)
257
+ else:
258
+ for _, sub_tokens in enumerate(batch):
259
+ try:
260
+ result = self.apply_chunk_merging(result, sub_tokens)
261
+ except Exception as e:
262
+ print(e)
263
+
264
+ result = " ".join(result)
265
+ return result
266
+
267
+ def predict(self, batches):
268
+ t11 = time()
269
+ predictions = []
270
+ for batch, model in zip(batches, self.models):
271
+ batch = batch.to(self.device)
272
+ with torch.no_grad():
273
+ prediction = model.forward(**batch)
274
+ predictions.append(prediction)
275
+
276
+ preds, idx, error_probs = self._convert(predictions)
277
+ t55 = time()
278
+ if self.log:
279
+ print(f"Inference time {t55 - t11}")
280
+ return preds, idx, error_probs
281
+
282
+ def get_token_action(self, token, index, prob, sugg_token):
283
+ """Get lost of suggested actions for token."""
284
+ # cases when we don't need to do anything
285
+ if prob < self.min_error_probability or sugg_token in [UNK, PAD, '$KEEP']:
286
+ return None
287
+
288
+ if sugg_token.startswith('$REPLACE_') or sugg_token.startswith('$TRANSFORM_') or sugg_token == '$DELETE':
289
+ start_pos = index
290
+ end_pos = index + 1
291
+ elif sugg_token.startswith("$APPEND_") or sugg_token.startswith("$MERGE_"):
292
+ start_pos = index + 1
293
+ end_pos = index + 1
294
+
295
+ if sugg_token == "$DELETE":
296
+ sugg_token_clear = ""
297
+ elif sugg_token.startswith('$TRANSFORM_') or sugg_token.startswith("$MERGE_"):
298
+ sugg_token_clear = sugg_token[:]
299
+ else:
300
+ sugg_token_clear = sugg_token[sugg_token.index('_') + 1 :]
301
+
302
+ return start_pos - 1, end_pos - 1, sugg_token_clear, prob
303
+
304
+ def preprocess(self, token_batch):
305
+ seq_lens = [len(sequence) for sequence in token_batch if sequence]
306
+ if not seq_lens:
307
+ return []
308
+ max_len = min(max(seq_lens), self.max_len)
309
+ batches = []
310
+ for indexer in self.indexers:
311
+ token_batch = [[START_TOKEN] + sequence[:max_len] for sequence in token_batch]
312
+ batch = indexer(
313
+ token_batch,
314
+ return_tensors="pt",
315
+ padding=True,
316
+ is_split_into_words=True,
317
+ truncation=True,
318
+ add_special_tokens=False,
319
+ )
320
+ offset_batch = []
321
+ for i in range(len(token_batch)):
322
+ word_ids = batch.word_ids(batch_index=i)
323
+ offsets = [0]
324
+ for j in range(1, len(word_ids)):  # separate index so the outer batch index `i` is not shadowed
325
+ if word_ids[j] != word_ids[j - 1]:
326
+ offsets.append(j)
327
+ offset_batch.append(torch.LongTensor(offsets))
328
+
329
+ batch["input_offsets"] = torch.nn.utils.rnn.pad_sequence(
330
+ offset_batch, batch_first=True, padding_value=0
331
+ ).to(torch.long)
332
+
333
+ batches.append(batch)
334
+
335
+ return batches
336
+
337
+ def _convert(self, data):
338
+ all_class_probs = torch.zeros_like(data[0]['logits'])
339
+ error_probs = torch.zeros_like(data[0]['max_error_probability'])
340
+ for output, weight in zip(data, self.model_weights):
341
+ class_probabilities_labels = torch.softmax(output['logits'], dim=-1)
342
+ all_class_probs += weight * class_probabilities_labels / sum(self.model_weights)
343
+ class_probabilities_d = torch.softmax(output['detect_logits'], dim=-1)
344
+ error_probs_d = class_probabilities_d[:, :, self.incorr_index]
345
+ incorr_prob = torch.max(error_probs_d, dim=-1)[0]
346
+ error_probs += weight * incorr_prob / sum(self.model_weights)
347
+
348
+ max_vals = torch.max(all_class_probs, dim=-1)
349
+ probs = max_vals[0].tolist()
350
+ idx = max_vals[1].tolist()
351
+ return probs, idx, error_probs.tolist()
352
+
353
+ def update_final_batch(self, final_batch, pred_ids, pred_batch, prev_preds_dict):
354
+ new_pred_ids = []
355
+ total_updated = 0
356
+ for i, orig_id in enumerate(pred_ids):
357
+ orig = final_batch[orig_id]
358
+ pred = pred_batch[i]
359
+ prev_preds = prev_preds_dict[orig_id]
360
+ if orig != pred and pred not in prev_preds:
361
+ final_batch[orig_id] = pred
362
+ new_pred_ids.append(orig_id)
363
+ prev_preds_dict[orig_id].append(pred)
364
+ total_updated += 1
365
+ elif orig != pred and pred in prev_preds:
366
+ # update final batch, but stop iterations
367
+ final_batch[orig_id] = pred
368
+ total_updated += 1
369
+ else:
370
+ continue
371
+ return final_batch, new_pred_ids, total_updated
372
+
373
+ def postprocess_batch(self, batch, all_probabilities, all_idxs, error_probs):
374
+ all_results = []
375
+ noop_index = self.vocab.get_token_index("$KEEP", "labels")
376
+ for tokens, probabilities, idxs, error_prob in zip(batch, all_probabilities, all_idxs, error_probs):
377
+ length = min(len(tokens), self.max_len)
378
+ edits = []
379
+
380
+ # skip the whole sentence if there are no errors
381
+ if max(idxs) == 0:
382
+ all_results.append(tokens)
383
+ continue
384
+
385
+ # skip the whole sentence if its overall error probability is below the threshold
386
+ if error_prob < self.min_error_probability:
387
+ all_results.append(tokens)
388
+ continue
389
+
390
+ for i in range(length + 1):
391
+ # because of START token
392
+ if i == 0:
393
+ token = START_TOKEN
394
+ else:
395
+ token = tokens[i - 1]
396
+ # skip if there is no error
397
+ if idxs[i] == noop_index:
398
+ continue
399
+
400
+ sugg_token = self.vocab.get_token_from_index(idxs[i], namespace='labels')
401
+ action = self.get_token_action(token, i, probabilities[i], sugg_token)
402
+ if not action:
403
+ continue
404
+
405
+ edits.append(action)
406
+ all_results.append(get_target_sent_by_edits(tokens, edits))
407
+ return all_results
408
+
409
+ def handle_batch(self, full_batch, merge_punc=True):
410
+ """
411
+ Handle batch of requests.
412
+ """
413
+ if self.split_chunk:
414
+ full_batch, indices = self.split_chunks(full_batch)
415
+ else:
416
+ indices = None
417
+ final_batch = full_batch[:]
418
+ batch_size = len(full_batch)
419
+ prev_preds_dict = {i: [final_batch[i]] for i in range(len(final_batch))}
420
+ short_ids = [i for i in range(len(full_batch)) if len(full_batch[i]) < self.min_len]
421
+ pred_ids = [i for i in range(len(full_batch)) if i not in short_ids]
422
+ total_updates = 0
423
+
424
+ for n_iter in range(self.iterations):
425
+ orig_batch = [final_batch[i] for i in pred_ids]
426
+
427
+ sequences = self.preprocess(orig_batch)
428
+
429
+ if not sequences:
430
+ break
431
+ probabilities, idxs, error_probs = self.predict(sequences)
432
+
433
+ pred_batch = self.postprocess_batch(orig_batch, probabilities, idxs, error_probs)
434
+ if self.log:
435
+ print(f"Iteration {n_iter + 1}. Predicted {round(100*len(pred_ids)/batch_size, 1)}% of sentences.")
436
+
437
+ final_batch, pred_ids, cnt = self.update_final_batch(final_batch, pred_ids, pred_batch, prev_preds_dict)
438
+ total_updates += cnt
439
+
440
+ if not pred_ids:
441
+ break
442
+ if self.split_chunk:
443
+ final_batch = [self.merge_chunks(final_batch[start:end]) for (start, end) in indices]
444
+ else:
445
+ final_batch = [" ".join(x) for x in final_batch]
446
+ if merge_punc:
447
+ final_batch = [re.sub(r'\s+(%s)' % self.punc_str, r'\1', x) for x in final_batch]
448
+
449
+ return final_batch
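
gec_model.py is the inference entry point: it loads one or more Seq2Labels checkpoints, tokenizes input with the backbone tokenizer, and applies the predicted edit labels for up to `iterations` rounds. A hedged usage sketch follows; the directory layout and the example sentences are assumptions about a local clone (the wrapper resolves the backbone path relative to `sys.path[0]`), not something stated in the commit:

```python
# Hedged usage sketch for GecBERTModel; paths assume a local clone laid out as in this commit.
from gec_model import GecBERTModel

model = GecBERTModel(
    vocab_path="vocabulary",  # directory with labels.txt / d_tags.txt added in this commit
    model_paths=".",          # directory containing config.json and pytorch_model.bin
    iterations=3,             # maximum number of correction passes
    split_chunk=True,         # split long inputs into overlapping 48-word chunks
)

# forward() accepts a single string, a batch of strings, or pre-tokenized input.
corrected = model(["hom nay troi dep qua", "ban ten gi"])  # placeholder sentences
print(corrected)  # list of corrected output strings
```
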
modeling_seq2labels.py ADDED
@@ -0,0 +1,124 @@
1
+ from typing import Any, Dict, List, Optional, Tuple, Union
2
+ from torch import nn
3
+ from torch.nn import CrossEntropyLoss
4
+ from transformers import AutoConfig, AutoModel, BertPreTrainedModel
5
+ from transformers.modeling_outputs import ModelOutput
6
+ import sys
7
+ import torch
8
+ current_dir = sys.path[0].replace('\\','/')
9
+
10
+ def get_range_vector(size: int, device: int) -> torch.Tensor:
11
+ """
12
+ Returns a range vector with the desired size, starting at 0. The CUDA implementation
13
+ is meant to avoid copy data from CPU to GPU.
14
+ """
15
+ return torch.arange(0, size, dtype=torch.long, device=device)
16
+
17
+
18
+ class Seq2LabelsOutput(ModelOutput):
19
+ loss: Optional[torch.FloatTensor] = None
20
+ logits: torch.FloatTensor = None
21
+ detect_logits: torch.FloatTensor = None
22
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
23
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
24
+ max_error_probability: Optional[torch.FloatTensor] = None
25
+
26
+
27
+ class Seq2LabelsModel(BertPreTrainedModel):
28
+
29
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
30
+
31
+ def __init__(self, config):
32
+ super().__init__(config)
33
+ self.num_labels = config.num_labels
34
+ self.num_detect_classes = config.num_detect_classes
35
+ self.label_smoothing = config.label_smoothing
36
+
37
+ if config.load_pretrained:
38
+ self.bert = AutoModel.from_pretrained(current_dir + "/" + config.pretrained_name_or_path)
39
+ bert_config = self.bert.config
40
+ else:
41
+ print(current_dir + "/" + config.pretrained_name_or_path)
42
+ bert_config = AutoConfig.from_pretrained(current_dir + "/" + config.pretrained_name_or_path)
43
+ self.bert = AutoModel.from_config(bert_config)
44
+
45
+ if config.special_tokens_fix:
46
+ try:
47
+ vocab_size = self.bert.embeddings.word_embeddings.num_embeddings
48
+ except AttributeError:
49
+ # reserve more space
50
+ vocab_size = self.bert.word_embedding.num_embeddings + 5
51
+ self.bert.resize_token_embeddings(vocab_size + 1)
52
+
53
+ predictor_dropout = config.predictor_dropout if config.predictor_dropout is not None else 0.0
54
+ self.dropout = nn.Dropout(predictor_dropout)
55
+ self.classifier = nn.Linear(bert_config.hidden_size, config.vocab_size)
56
+ self.detector = nn.Linear(bert_config.hidden_size, config.num_detect_classes)
57
+
58
+ # Initialize weights and apply final processing
59
+ self.post_init()
60
+
61
+ def forward(
62
+ self,
63
+ input_ids: Optional[torch.Tensor] = None,
64
+ input_offsets: Optional[torch.Tensor] = None,
65
+ attention_mask: Optional[torch.Tensor] = None,
66
+ token_type_ids: Optional[torch.Tensor] = None,
67
+ position_ids: Optional[torch.Tensor] = None,
68
+ head_mask: Optional[torch.Tensor] = None,
69
+ inputs_embeds: Optional[torch.Tensor] = None,
70
+ labels: Optional[torch.Tensor] = None,
71
+ d_tags: Optional[torch.Tensor] = None,
72
+ output_attentions: Optional[bool] = None,
73
+ output_hidden_states: Optional[bool] = None,
74
+ return_dict: Optional[bool] = None,
75
+ ) -> Union[Tuple[torch.Tensor], Seq2LabelsOutput]:
76
+ r"""
77
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
78
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
79
+ """
80
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
81
+
82
+ outputs = self.bert(
83
+ input_ids,
84
+ attention_mask=attention_mask,
85
+ token_type_ids=token_type_ids,
86
+ position_ids=position_ids,
87
+ head_mask=head_mask,
88
+ inputs_embeds=inputs_embeds,
89
+ output_attentions=output_attentions,
90
+ output_hidden_states=output_hidden_states,
91
+ return_dict=return_dict,
92
+ )
93
+
94
+ sequence_output = outputs[0]
95
+
96
+ if input_offsets is not None:
97
+ # offsets is (batch_size, d1, ..., dn, orig_sequence_length)
98
+ range_vector = get_range_vector(input_offsets.size(0), device=sequence_output.device).unsqueeze(1)
99
+ # selected embeddings is also (batch_size * d1 * ... * dn, orig_sequence_length)
100
+ sequence_output = sequence_output[range_vector, input_offsets]
101
+
102
+ logits = self.classifier(self.dropout(sequence_output))
103
+ logits_d = self.detector(sequence_output)
104
+
105
+ loss = None
106
+ if labels is not None and d_tags is not None:
107
+ loss_labels_fct = CrossEntropyLoss(label_smoothing=self.label_smoothing)
108
+ loss_d_fct = CrossEntropyLoss()
109
+ loss_labels = loss_labels_fct(logits.view(-1, self.num_labels), labels.view(-1))
110
+ loss_d = loss_d_fct(logits_d.view(-1, self.num_detect_classes), d_tags.view(-1))
111
+ loss = loss_labels + loss_d
112
+
113
+ if not return_dict:
114
+ output = (logits, logits_d) + outputs[2:]
115
+ return ((loss,) + output) if loss is not None else output
116
+
117
+ return Seq2LabelsOutput(
118
+ loss=loss,
119
+ logits=logits,
120
+ detect_logits=logits_d,
121
+ hidden_states=outputs.hidden_states,
122
+ attentions=outputs.attentions,
123
+ max_error_probability=torch.ones(logits.size(0), device=logits.device),
124
+ )
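
The forward pass returns two sets of logits: `logits` from the label head (one of the `vocab_size` edit tags per word) and `detect_logits` from the detection head (CORRECT/INCORRECT tagging). The sketch below mirrors how `GecBERTModel._convert` consumes them, using random tensors of the right shapes so it runs without the checkpoint; the sizes come from config.json and the INCORRECT index from vocabulary/d_tags.txt:

```python
# Shape sketch only: random tensors stand in for Seq2LabelsModel outputs.
import torch

batch_size, num_words, vocab_size, num_detect_classes = 2, 10, 15, 4
logits = torch.randn(batch_size, num_words, vocab_size)                 # label head output
detect_logits = torch.randn(batch_size, num_words, num_detect_classes)  # detection head output

label_probs = torch.softmax(logits, dim=-1)
best_prob, best_label = label_probs.max(dim=-1)        # per-word probability and label index
incorrect_index = 1                                    # INCORRECT in vocabulary/d_tags.txt (0-indexed)
error_prob = torch.softmax(detect_logits, dim=-1)[:, :, incorrect_index].max(dim=-1).values
print(best_label.shape, error_prob.shape)              # torch.Size([2, 10]) torch.Size([2])
```
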
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6e2a5c2b1cbf16a9fd0b88c0dc8585f3832a60d10eea8140854f8d8f32c188d
3
+ size 1112304873
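
This entry is a Git LFS pointer: only the SHA-256 and size (~1.1 GB) are stored in git, the weights themselves live in LFS. A hedged sketch of fetching the real file through the Hub client; the repo id below is a placeholder, not taken from this commit:

```python
# Sketch only: download the LFS-backed checkpoint via huggingface_hub.
# "namespace/xlm-roberta-capu" is a placeholder repo id.
from huggingface_hub import hf_hub_download

weights_path = hf_hub_download(repo_id="namespace/xlm-roberta-capu", filename="pytorch_model.bin")
print(weights_path)  # local cache path of the downloaded weights
```
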
utils_gec.py ADDED
@@ -0,0 +1,233 @@
1
+ import os
2
+ from pathlib import Path
3
+ import re
4
+
5
+
6
+ VOCAB_DIR = Path(__file__).resolve().parent
7
+ PAD = "@@PADDING@@"
8
+ UNK = "@@UNKNOWN@@"
9
+ START_TOKEN = "$START"
10
+ SEQ_DELIMETERS = {"tokens": " ", "labels": "SEPL|||SEPR", "operations": "SEPL__SEPR"}
11
+
12
+
13
+ def get_verb_form_dicts():
14
+ path_to_dict = os.path.join(VOCAB_DIR, "verb-form-vocab.txt")
15
+ encode, decode = {}, {}
16
+ with open(path_to_dict, encoding="utf-8") as f:
17
+ for line in f:
18
+ words, tags = line.split(":")
19
+ word1, word2 = words.split("_")
20
+ tag1, tag2 = tags.split("_")
21
+ decode_key = f"{word1}_{tag1}_{tag2.strip()}"
22
+ if decode_key not in decode:
23
+ encode[words] = tags
24
+ decode[decode_key] = word2
25
+ return encode, decode
26
+
27
+
28
+ ENCODE_VERB_DICT, DECODE_VERB_DICT = get_verb_form_dicts()
29
+
30
+
31
+ def get_target_sent_by_edits(source_tokens, edits):
32
+ target_tokens = source_tokens[:]
33
+ shift_idx = 0
34
+ for edit in edits:
35
+ start, end, label, _ = edit
36
+ target_pos = start + shift_idx
37
+ if start < 0:
38
+ continue
39
+ elif len(target_tokens) > target_pos:
40
+ source_token = target_tokens[target_pos]
41
+ else:
42
+ source_token = ""
43
+ if label == "":
44
+ del target_tokens[target_pos]
45
+ shift_idx -= 1
46
+ elif start == end:
47
+ word = label.replace("$APPEND_", "")
48
+ # Avoid appending same token twice
49
+ if (target_pos < len(target_tokens) and target_tokens[target_pos] == word) or (
50
+ target_pos > 0 and target_tokens[target_pos - 1] == word
51
+ ):
52
+ continue
53
+ target_tokens[target_pos:target_pos] = [word]
54
+ shift_idx += 1
55
+ elif label.startswith("$TRANSFORM_"):
56
+ word = apply_reverse_transformation(source_token, label)
57
+ if word is None:
58
+ word = source_token
59
+ target_tokens[target_pos] = word
60
+ elif start == end - 1:
61
+ word = label.replace("$REPLACE_", "")
62
+ target_tokens[target_pos] = word
63
+ elif label.startswith("$MERGE_"):
64
+ target_tokens[target_pos + 1 : target_pos + 1] = [label]
65
+ shift_idx += 1
66
+
67
+ return replace_merge_transforms(target_tokens)
68
+
69
+
70
+ def replace_merge_transforms(tokens):
71
+ if all(not x.startswith("$MERGE_") for x in tokens):
72
+ return tokens
73
+ if tokens[0].startswith("$MERGE_"):
74
+ tokens = tokens[1:]
75
+ if tokens[-1].startswith("$MERGE_"):
76
+ tokens = tokens[:-1]
77
+
78
+ target_line = " ".join(tokens)
79
+ target_line = target_line.replace(" $MERGE_HYPHEN ", "-")
80
+ target_line = target_line.replace(" $MERGE_SPACE ", "")
81
+ target_line = re.sub(r'([\.\,\?\:]\s+)+', r'\1', target_line)
82
+ return target_line.split()
83
+
84
+
85
+ def convert_using_case(token, smart_action):
86
+ if not smart_action.startswith("$TRANSFORM_CASE_"):
87
+ return token
88
+ if smart_action.endswith("LOWER"):
89
+ return token.lower()
90
+ elif smart_action.endswith("UPPER"):
91
+ return token.upper()
92
+ elif smart_action.endswith("CAPITAL"):
93
+ return token.capitalize()
94
+ elif smart_action.endswith("CAPITAL_1"):
95
+ return token[0] + token[1:].capitalize()
96
+ elif smart_action.endswith("UPPER_-1"):
97
+ return token[:-1].upper() + token[-1]
98
+ else:
99
+ return token
100
+
101
+
102
+ def convert_using_verb(token, smart_action):
103
+ key_word = "$TRANSFORM_VERB_"
104
+ if not smart_action.startswith(key_word):
105
+ raise Exception(f"Unknown action type {smart_action}")
106
+ encoding_part = f"{token}_{smart_action[len(key_word):]}"
107
+ decoded_target_word = decode_verb_form(encoding_part)
108
+ return decoded_target_word
109
+
110
+
111
+ def convert_using_split(token, smart_action):
112
+ key_word = "$TRANSFORM_SPLIT"
113
+ if not smart_action.startswith(key_word):
114
+ raise Exception(f"Unknown action type {smart_action}")
115
+ target_words = token.split("-")
116
+ return " ".join(target_words)
117
+
118
+
119
+ def convert_using_plural(token, smart_action):
120
+ if smart_action.endswith("PLURAL"):
121
+ return token + "s"
122
+ elif smart_action.endswith("SINGULAR"):
123
+ return token[:-1]
124
+ else:
125
+ raise Exception(f"Unknown action type {smart_action}")
126
+
127
+
128
+ def apply_reverse_transformation(source_token, transform):
129
+ if transform.startswith("$TRANSFORM"):
130
+ # deal with equal
131
+ if transform == "$KEEP":
132
+ return source_token
133
+ # deal with case
134
+ if transform.startswith("$TRANSFORM_CASE"):
135
+ return convert_using_case(source_token, transform)
136
+ # deal with verb
137
+ if transform.startswith("$TRANSFORM_VERB"):
138
+ return convert_using_verb(source_token, transform)
139
+ # deal with split
140
+ if transform.startswith("$TRANSFORM_SPLIT"):
141
+ return convert_using_split(source_token, transform)
142
+ # deal with single/plural
143
+ if transform.startswith("$TRANSFORM_AGREEMENT"):
144
+ return convert_using_plural(source_token, transform)
145
+ # raise exception if not find correct type
146
+ raise Exception(f"Unknown action type {transform}")
147
+ else:
148
+ return source_token
149
+
150
+
151
+ # def read_parallel_lines(fn1, fn2):
152
+ # lines1 = read_lines(fn1, skip_strip=True)
153
+ # lines2 = read_lines(fn2, skip_strip=True)
154
+ # assert len(lines1) == len(lines2)
155
+ # out_lines1, out_lines2 = [], []
156
+ # for line1, line2 in zip(lines1, lines2):
157
+ # if not line1.strip() or not line2.strip():
158
+ # continue
159
+ # else:
160
+ # out_lines1.append(line1)
161
+ # out_lines2.append(line2)
162
+ # return out_lines1, out_lines2
163
+
164
+
165
+ def read_parallel_lines(fn1, fn2):
166
+ with open(fn1, encoding='utf-8') as f1, open(fn2, encoding='utf-8') as f2:
167
+ for line1, line2 in zip(f1, f2):
168
+ line1 = line1.strip()
169
+ line2 = line2.strip()
170
+
171
+ yield line1, line2
172
+
173
+
174
+ def read_lines(fn, skip_strip=False):
175
+ if not os.path.exists(fn):
176
+ return []
177
+ with open(fn, 'r', encoding='utf-8') as f:
178
+ lines = f.readlines()
179
+ return [s.strip() for s in lines if s.strip() or skip_strip]
180
+
181
+
182
+ def write_lines(fn, lines, mode='w'):
183
+ if mode == 'w' and os.path.exists(fn):
184
+ os.remove(fn)
185
+ with open(fn, encoding='utf-8', mode=mode) as f:
186
+ f.writelines(['%s\n' % s for s in lines])
187
+
188
+
189
+ def decode_verb_form(original):
190
+ return DECODE_VERB_DICT.get(original)
191
+
192
+
193
+ def encode_verb_form(original_word, corrected_word):
194
+ decoding_request = original_word + "_" + corrected_word
195
+ decoding_response = ENCODE_VERB_DICT.get(decoding_request, "").strip()
196
+ if original_word and decoding_response:
197
+ answer = decoding_response
198
+ else:
199
+ answer = None
200
+ return answer
201
+
202
+
203
+ def get_weights_name(transformer_name, lowercase):
204
+ if transformer_name == 'bert' and lowercase:
205
+ return 'bert-base-uncased'
206
+ if transformer_name == 'bert' and not lowercase:
207
+ return 'bert-base-cased'
208
+ if transformer_name == 'bert-large' and not lowercase:
209
+ return 'bert-large-cased'
210
+ if transformer_name == 'distilbert':
211
+ if not lowercase:
212
+ print('Warning! This model was trained only on uncased sentences.')
213
+ return 'distilbert-base-uncased'
214
+ if transformer_name == 'albert':
215
+ if not lowercase:
216
+ print('Warning! This model was trained only on uncased sentences.')
217
+ return 'albert-base-v1'
218
+ if lowercase:
219
+ print('Warning! This model was trained only on cased sentences.')
220
+ if transformer_name == 'roberta':
221
+ return 'roberta-base'
222
+ if transformer_name == 'roberta-large':
223
+ return 'roberta-large'
224
+ if transformer_name == 'gpt2':
225
+ return 'gpt2'
226
+ if transformer_name == 'transformerxl':
227
+ return 'transfo-xl-wt103'
228
+ if transformer_name == 'xlnet':
229
+ return 'xlnet-base-cased'
230
+ if transformer_name == 'xlnet-large':
231
+ return 'xlnet-large-cased'
232
+
233
+ return transformer_name
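
The heart of the post-processing is `get_target_sent_by_edits`, which applies `(start, end, label, probability)` edits to the source tokens. A small sketch using labels that exist in vocabulary/labels.txt; the sentence and probabilities are made up for illustration, and it assumes you run it inside a clone of the repo so verb-form-vocab.txt sits next to utils_gec.py:

```python
# Illustrative only: apply a few edit labels by hand, mirroring what postprocess_batch produces.
from utils_gec import get_target_sent_by_edits

tokens = ["xin", "chao", "viet", "nam"]
edits = [
    (0, 1, "$TRANSFORM_CASE_CAPITAL", 0.9),  # capitalize "xin"
    (2, 3, "$TRANSFORM_CASE_CAPITAL", 0.8),  # capitalize "viet"
    (3, 4, "$TRANSFORM_CASE_CAPITAL", 0.8),  # capitalize "nam"
    (4, 4, "$APPEND_.", 0.7),                # append a full stop after the last word
]
print(get_target_sent_by_edits(tokens, edits))
# ['Xin', 'chao', 'Viet', 'Nam', '.']
```
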
verb-form-vocab.txt ADDED
The diff for this file is too large to render.
 
vocabulary.py ADDED
@@ -0,0 +1,277 @@
1
+ import codecs
2
+ from collections import defaultdict
3
+ import logging
4
+ import os
5
+ import re
6
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Union, TYPE_CHECKING
7
+ from filelock import FileLock
8
+
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels")
13
+ DEFAULT_PADDING_TOKEN = "@@PADDING@@"
14
+ DEFAULT_OOV_TOKEN = "@@UNKNOWN@@"
15
+ NAMESPACE_PADDING_FILE = "non_padded_namespaces.txt"
16
+ _NEW_LINE_REGEX = re.compile(r"\n|\r\n")
17
+
18
+
19
+ def namespace_match(pattern: str, namespace: str):
20
+ """
21
+ Matches a namespace pattern against a namespace string. For example, `*tags` matches
22
+ `passage_tags` and `question_tags` and `tokens` matches `tokens` but not
23
+ `stemmed_tokens`.
24
+ """
25
+ if pattern[0] == "*" and namespace.endswith(pattern[1:]):
26
+ return True
27
+ elif pattern == namespace:
28
+ return True
29
+ return False
30
+
31
+
32
+ class _NamespaceDependentDefaultDict(defaultdict):
33
+ """
34
+ This is a [defaultdict]
35
+ (https://docs.python.org/3/library/collections.html#collections.defaultdict) where the
36
+ default value is dependent on the key that is passed.
37
+ We use "namespaces" in the :class:`Vocabulary` object to keep track of several different
38
+ mappings from strings to integers, so that we have a consistent API for mapping words, tags,
39
+ labels, characters, or whatever else you want, into integers. The issue is that some of those
40
+ namespaces (words and characters) should have integers reserved for padding and
41
+ out-of-vocabulary tokens, while others (labels and tags) shouldn't. This class allows you to
42
+ specify filters on the namespace (the key used in the `defaultdict`), and use different
43
+ default values depending on whether the namespace passes the filter.
44
+ To do filtering, we take a set of `non_padded_namespaces`. This is a set of strings
45
+ that are either matched exactly against the keys, or treated as suffixes, if the
46
+ string starts with `*`. In other words, if `*tags` is in `non_padded_namespaces` then
47
+ `passage_tags`, `question_tags`, etc. (anything that ends with `tags`) will have the
48
+ `non_padded` default value.
49
+ # Parameters
50
+ non_padded_namespaces : `Iterable[str]`
51
+ A set / list / tuple of strings describing which namespaces are not padded. If a namespace
52
+ (key) is missing from this dictionary, we will use :func:`namespace_match` to see whether
53
+ the namespace should be padded. If the given namespace matches any of the strings in this
54
+ list, we will use `non_padded_function` to initialize the value for that namespace, and
55
+ we will use `padded_function` otherwise.
56
+ padded_function : `Callable[[], Any]`
57
+ A zero-argument function to call to initialize a value for a namespace that `should` be
58
+ padded.
59
+ non_padded_function : `Callable[[], Any]`
60
+ A zero-argument function to call to initialize a value for a namespace that should `not` be
61
+ padded.
62
+ """
63
+
64
+ def __init__(
65
+ self,
66
+ non_padded_namespaces: Iterable[str],
67
+ padded_function: Callable[[], Any],
68
+ non_padded_function: Callable[[], Any],
69
+ ) -> None:
70
+ self._non_padded_namespaces = set(non_padded_namespaces)
71
+ self._padded_function = padded_function
72
+ self._non_padded_function = non_padded_function
73
+ super().__init__()
74
+
75
+ def add_non_padded_namespaces(self, non_padded_namespaces: Set[str]):
76
+ # add non_padded_namespaces which weren't already present
77
+ self._non_padded_namespaces.update(non_padded_namespaces)
78
+
79
+
80
+ class _TokenToIndexDefaultDict(_NamespaceDependentDefaultDict):
81
+ def __init__(self, non_padded_namespaces: Set[str], padding_token: str, oov_token: str) -> None:
82
+ super().__init__(non_padded_namespaces, lambda: {padding_token: 0, oov_token: 1}, lambda: {})
83
+
84
+
85
+ class _IndexToTokenDefaultDict(_NamespaceDependentDefaultDict):
86
+ def __init__(self, non_padded_namespaces: Set[str], padding_token: str, oov_token: str) -> None:
87
+ super().__init__(non_padded_namespaces, lambda: {0: padding_token, 1: oov_token}, lambda: {})
88
+
89
+
90
+ class Vocabulary:
91
+ def __init__(
92
+ self,
93
+ counter: Dict[str, Dict[str, int]] = None,
94
+ min_count: Dict[str, int] = None,
95
+ max_vocab_size: Union[int, Dict[str, int]] = None,
96
+ non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
97
+ pretrained_files: Optional[Dict[str, str]] = None,
98
+ only_include_pretrained_words: bool = False,
99
+ tokens_to_add: Dict[str, List[str]] = None,
100
+ min_pretrained_embeddings: Dict[str, int] = None,
101
+ padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
102
+ oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
103
+ ) -> None:
104
+ self._padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
105
+ self._oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN
106
+
107
+ self._non_padded_namespaces = set(non_padded_namespaces)
108
+
109
+ self._token_to_index = _TokenToIndexDefaultDict(
110
+ self._non_padded_namespaces, self._padding_token, self._oov_token
111
+ )
112
+ self._index_to_token = _IndexToTokenDefaultDict(
113
+ self._non_padded_namespaces, self._padding_token, self._oov_token
114
+ )
115
+
116
+ @classmethod
117
+ def from_files(
118
+ cls,
119
+ directory: Union[str, os.PathLike],
120
+ padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
121
+ oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
122
+ ) -> "Vocabulary":
123
+ """
124
+ Loads a `Vocabulary` that was serialized either using `save_to_files` or inside
125
+ a model archive file.
126
+ # Parameters
127
+ directory : `str`
128
+ The directory or archive file containing the serialized vocabulary.
129
+ """
130
+ logger.info("Loading token dictionary from %s.", directory)
131
+ padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
132
+ oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN
133
+
134
+ if not os.path.isdir(directory):
135
+ raise ValueError(f"{directory} does not exist")
136
+
137
+ # We use a lock file to avoid race conditions where multiple processes
138
+ # might be reading/writing from/to the same vocab files at once.
139
+ with FileLock(os.path.join(directory, ".lock")):
140
+ with codecs.open(os.path.join(directory, NAMESPACE_PADDING_FILE), "r", "utf-8") as namespace_file:
141
+ non_padded_namespaces = [namespace_str.strip() for namespace_str in namespace_file]
142
+
143
+ vocab = cls(
144
+ non_padded_namespaces=non_padded_namespaces,
145
+ padding_token=padding_token,
146
+ oov_token=oov_token,
147
+ )
148
+
149
+ # Check every file in the directory.
150
+ for namespace_filename in os.listdir(directory):
151
+ if namespace_filename == NAMESPACE_PADDING_FILE:
152
+ continue
153
+ if namespace_filename.startswith("."):
154
+ continue
155
+ namespace = namespace_filename.replace(".txt", "")
156
+ if any(namespace_match(pattern, namespace) for pattern in non_padded_namespaces):
157
+ is_padded = False
158
+ else:
159
+ is_padded = True
160
+ filename = os.path.join(directory, namespace_filename)
161
+ vocab.set_from_file(filename, is_padded, namespace=namespace, oov_token=oov_token)
162
+
163
+ return vocab
164
+
165
+ @classmethod
166
+ def empty(cls) -> "Vocabulary":
167
+ """
168
+ This method returns a bare vocabulary instantiated with `cls()` (so, `Vocabulary()` if you
169
+ haven't made a subclass of this object). The only reason to call `Vocabulary.empty()`
170
+ instead of `Vocabulary()` is if you are instantiating this object from a config file. We
171
+ register this constructor with the key "empty", so if you know that you don't need to
172
+ compute a vocabulary (either because you're loading a pre-trained model from an archive
173
+ file, you're using a pre-trained transformer that has its own vocabulary, or something
174
+ else), you can use this to avoid having the default vocabulary construction code iterate
175
+ through the data.
176
+ """
177
+ return cls()
178
+
179
+ def set_from_file(
180
+ self,
181
+ filename: str,
182
+ is_padded: bool = True,
183
+ oov_token: str = DEFAULT_OOV_TOKEN,
184
+ namespace: str = "tokens",
185
+ ):
186
+ """
187
+ If you already have a vocabulary file for a trained model somewhere, and you really want to
188
+ use that vocabulary file instead of just setting the vocabulary from a dataset, for
189
+ whatever reason, you can do that with this method. You must specify the namespace to use,
190
+ and we assume that you want to use padding and OOV tokens for this.
191
+ # Parameters
192
+ filename : `str`
193
+ The file containing the vocabulary to load. It should be formatted as one token per
194
+ line, with nothing else in the line. The index we assign to the token is the line
195
+ number in the file (1-indexed if `is_padded`, 0-indexed otherwise). Note that this
196
+ file should contain the OOV token string!
197
+ is_padded : `bool`, optional (default=`True`)
198
+ Is this vocabulary padded? For token / word / character vocabularies, this should be
199
+ `True`; while for tag or label vocabularies, this should typically be `False`. If
200
+ `True`, we add a padding token with index 0, and we enforce that the `oov_token` is
201
+ present in the file.
202
+ oov_token : `str`, optional (default=`DEFAULT_OOV_TOKEN`)
203
+ What token does this vocabulary use to represent out-of-vocabulary characters? This
204
+ must show up as a line in the vocabulary file. When we find it, we replace
205
+ `oov_token` with `self._oov_token`, because we only use one OOV token across
206
+ namespaces.
207
+ namespace : `str`, optional (default=`"tokens"`)
208
+ What namespace should we overwrite with this vocab file?
209
+ """
210
+ if is_padded:
211
+ self._token_to_index[namespace] = {self._padding_token: 0}
212
+ self._index_to_token[namespace] = {0: self._padding_token}
213
+ else:
214
+ self._token_to_index[namespace] = {}
215
+ self._index_to_token[namespace] = {}
216
+ with codecs.open(filename, "r", "utf-8") as input_file:
217
+ lines = _NEW_LINE_REGEX.split(input_file.read())
218
+ # Be flexible about having final newline or not
219
+ if lines and lines[-1] == "":
220
+ lines = lines[:-1]
221
+ for i, line in enumerate(lines):
222
+ index = i + 1 if is_padded else i
223
+ token = line.replace("@@NEWLINE@@", "\n")
224
+ if token == oov_token:
225
+ token = self._oov_token
226
+ self._token_to_index[namespace][token] = index
227
+ self._index_to_token[namespace][index] = token
228
+ if is_padded:
229
+ assert self._oov_token in self._token_to_index[namespace], "OOV token not found!"
230
+
231
+ def add_token_to_namespace(self, token: str, namespace: str = "tokens") -> int:
232
+ """
233
+ Adds `token` to the index, if it is not already present. Either way, we return the index of
234
+ the token.
235
+ """
236
+ if not isinstance(token, str):
237
+ raise ValueError(
238
+ "Vocabulary tokens must be strings, or saving and loading will break."
239
+ " Got %s (with type %s)" % (repr(token), type(token))
240
+ )
241
+ if token not in self._token_to_index[namespace]:
242
+ index = len(self._token_to_index[namespace])
243
+ self._token_to_index[namespace][token] = index
244
+ self._index_to_token[namespace][index] = token
245
+ return index
246
+ else:
247
+ return self._token_to_index[namespace][token]
248
+
249
+ def add_tokens_to_namespace(self, tokens: List[str], namespace: str = "tokens") -> List[int]:
250
+ """
251
+ Adds `tokens` to the index, if they are not already present. Either way, we return the
252
+ indices of the tokens in the order that they were given.
253
+ """
254
+ return [self.add_token_to_namespace(token, namespace) for token in tokens]
255
+
256
+ def get_token_index(self, token: str, namespace: str = "tokens") -> int:
257
+ try:
258
+ return self._token_to_index[namespace][token]
259
+ except KeyError:
260
+ try:
261
+ return self._token_to_index[namespace][self._oov_token]
262
+ except KeyError:
263
+ logger.error("Namespace: %s", namespace)
264
+ logger.error("Token: %s", token)
265
+ raise KeyError(
266
+ f"'{token}' not found in vocab namespace '{namespace}', and namespace "
267
+ f"does not contain the default OOV token ('{self._oov_token}')"
268
+ )
269
+
270
+ def get_token_from_index(self, index: int, namespace: str = "tokens") -> str:
271
+ return self._index_to_token[namespace][index]
272
+
273
+ def get_vocab_size(self, namespace: str = "tokens") -> int:
274
+ return len(self._token_to_index[namespace])
275
+
276
+ def get_namespaces(self) -> Set[str]:
277
+ return set(self._index_to_token.keys())
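
This is a trimmed-down version of AllenNLP's Vocabulary, kept only to read the label files committed under vocabulary/. A short sketch of the lookups GecBERTModel relies on (run from the repo root so the vocabulary/ directory is found):

```python
# Sketch: load the committed vocabulary/ directory and look up the indices used at inference time.
from vocabulary import Vocabulary

vocab = Vocabulary.from_files("vocabulary")
print(vocab.get_vocab_size("labels"))                     # 15, matches vocab_size in config.json
print(vocab.get_token_index("$KEEP", "labels"))           # 0 -> label meaning "leave the token unchanged"
print(vocab.get_token_index("INCORRECT", "d_tags"))       # 1 -> used for the sentence error probability
print(vocab.get_token_from_index(2, namespace="labels"))  # '$APPEND_,'
```
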
vocabulary/d_tags.txt ADDED
@@ -0,0 +1,4 @@
1
+ CORRECT
2
+ INCORRECT
3
+ @@UNKNOWN@@
4
+ @@PADDING@@
vocabulary/labels.txt ADDED
@@ -0,0 +1,15 @@
1
+ $KEEP
2
+ $TRANSFORM_CASE_CAPITAL
3
+ $APPEND_,
4
+ $APPEND_.
5
+ $TRANSFORM_VERB_VB_VBN
6
+ $TRANSFORM_CASE_UPPER
7
+ $APPEND_:
8
+ $APPEND_?
9
+ $TRANSFORM_VERB_VB_VBC
10
+ $TRANSFORM_CASE_LOWER
11
+ $TRANSFORM_CASE_CAPITAL_1
12
+ $TRANSFORM_CASE_UPPER_-1
13
+ $MERGE_SPACE
14
+ @@UNKNOWN@@
15
+ @@PADDING@@
vocabulary/non_padded_namespaces.txt ADDED
@@ -0,0 +1,2 @@
1
+ *tags
2
+ *labels
xlm-roberta-base/config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "architectures": [
3
+ "XLMRobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-05,
14
+ "max_position_embeddings": 514,
15
+ "model_type": "xlm-roberta",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "output_past": true,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.17.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 250002
25
+ }
xlm-roberta-base/tokenizer.json ADDED
The diff for this file is too large to render.