3v324v23 committed on
Commit 8e64bfa · 1 Parent(s): ea83b6a
.gitignore ADDED
@@ -0,0 +1,168 @@
+ # Initially taken from Github's Python gitignore file
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # tests and logs
+ tests/fixtures/cached_*_text.txt
+ logs/
+ lightning_logs/
+ lang_code_data/
+ nohup.out
+ output/
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # vscode
+ .vs
+ .vscode
+
+ # Pycharm
+ .idea
+
+ # TF code
+ tensorflow_code
+
+ # Models
+ proc_data
+
+ # examples
+ runs
+ /runs_old
+ /wandb
+ /examples/runs
+ /examples/**/*.args
+ /examples/rag/sweep
+ /inv
+
+ # data
+ /data
+ serialization_dir
+
+ # emacs
+ *.*~
+ debug.env
+
+ # vim
+ .*.swp
+
+ #ctags
+ tags
+
+ # pre-commit
+ .pre-commit*
+
+ # .lock
+ *.lock
+
+ inv.py
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+ "_name_or_path": "bozhou/DeBERTa-base",
+ "architectures": [
+ "DeBERTa"
+ ],
+ "auto_map": {
+ "AutoConfig": "modeling.config.ModelConfig",
+ "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
+ "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
+ },
+ "bos_token_id": 130004,
+ "eos_token_id": 130005,
+ "mask_token_id": 130000,
+ "gmask_token_id": 130001,
+ "pad_token_id": 3,
+ "hidden_size": 4096,
+ "inner_hidden_size": 16384,
+ "layernorm_epsilon": 1e-05,
+ "max_sequence_length": 2048,
+ "model_type": "chatglm",
+ "num_attention_heads": 32,
+ "num_layers": 28,
+ "position_encoding_2d": true,
+ "torch_dtype": "float16",
+ "transformers_version": "4.23.1",
+ "use_cache": true,
+ "vocab_size": 130528
+ }
modeling/__init__.py CHANGED
@@ -1,37 +0,0 @@
- #
- # Zhou Bo
-
- #
-
- """ Components for NN
- """
-
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
-
- from .tokenizers import *
- from .pooling import *
- from .mlm import MLMPredictionHead
- from .nnmodule import NNModule
- from .deberta import *
- from .disentangled_attention import *
- from .ops import *
- from .bert import *
- from .config import *
- from .cache_utils import *
- from .focal_loss import *
- # from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
- from .modeling import (BertConfig, BertModel, BertForPreTraining, BertForMaskedLM,
- BertForNextSentencePrediction, PreTrainedBertModel,
- BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
- BertForQuestionAnswering, BertForPreTrainingLossMask, BertPreTrainingPairRel,
- BertPreTrainingPairTransform, BertPreTrainingHeads, MLMHead)
- # from .optimization import BertAdam, BertAdamFineTune
- try:
- from .optimization_fp16 import FP16_Optimizer_State
- except:
- pass
- from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
- from .flash import FlashQuadModel
- from .gat import GatModel
modeling/bert.py CHANGED
@@ -6,17 +6,10 @@

 # This piece of code is modified based on https://github.com/huggingface/transformers

- import copy
 import torch
 from torch import nn
 from collections import Sequence
 from packaging import version
- import numpy as np
- import math
- import os
- import pdb
-
- import json
 from .ops import *
 from .disentangled_attention import *
 from .da_utils import *
modeling/cache_utils.py CHANGED
@@ -13,10 +13,7 @@ import os
 import requests
 from .config import ModelConfig
 import pathlib
- from ..utils import xtqdm as tqdm
- from zipfile import ZipFile
 import loguru
- # from ..utils import get_logger
 logger = loguru.logger

 __all__ = ['pretrained_models', 'load_model_state', 'load_vocab']
@@ -49,36 +46,7 @@ pretrained_models= {
 'deberta-v3-xsmall': PretrainedModel('deberta-v3-xsmall', 'spm.model', 'spm'),
 }

- def download_asset(url, name, tag=None, no_cache=False, cache_dir=None):
- _tag = tag
- if _tag is None:
- _tag = 'latest'
- if not cache_dir:
- cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/')
- os.makedirs(cache_dir, exist_ok=True)
- output=os.path.join(cache_dir, name)
- if os.path.exists(output) and (not no_cache):
- return output

- #repo=f'https://huggingface.co/microsoft/deberta-{name}/blob/main/bpe_encoder.bin'
- headers = {}
- headers['Accept'] = 'application/octet-stream'
- resp = requests.get(url, stream=True, headers=headers)
- if resp.status_code != 200:
- raise Exception(f'Request for {url} return {resp.status_code}, {resp.text}')
-
- try:
- with open(output, 'wb') as fs:
- progress = tqdm(total=int(resp.headers['Content-Length']) if 'Content-Length' in resp.headers else -1, ncols=80, desc=f'Downloading {name}')
- for c in resp.iter_content(chunk_size=1024*1024):
- fs.write(c)
- progress.update(len(c))
- progress.close()
- except:
- os.remove(output)
- raise
-
- return output

 def load_model_state(path_or_pretrained_id, tag=None, no_cache=False, cache_dir=None):
 model_path = path_or_pretrained_id
@@ -91,9 +59,6 @@ def load_model_state(path_or_pretrained_id, tag=None, no_cache=False, cache_dir=
 cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/{pretrained.name}')
 os.makedirs(cache_dir, exist_ok=True)
 model_path = os.path.join(cache_dir, 'pytorch_model.bin')
- if (not os.path.exists(model_path)) or no_cache:
- asset = download_asset(pretrained.model_url, 'pytorch_model.bin', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
- asset = download_asset(pretrained.config_url, 'model_config.json', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
 elif not model_path:
 return None,None

@@ -107,26 +72,3 @@ def load_model_state(path_or_pretrained_id, tag=None, no_cache=False, cache_dir=
 else:
 model_config = None
 return model_state, model_config
-
- def load_vocab(vocab_path=None, vocab_type=None, pretrained_id=None, tag=None, no_cache=False, cache_dir=None):
- if pretrained_id and (pretrained_id.lower() in pretrained_models):
- _tag = tag
- if _tag is None:
- _tag = 'latest'
-
- pretrained = pretrained_models[pretrained_id.lower()]
- if not cache_dir:
- cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/{pretrained.name}')
- os.makedirs(cache_dir, exist_ok=True)
- vocab_type = pretrained.vocab_type
- url = pretrained.vocab_url
- outname = os.path.basename(url)
- vocab_path =os.path.join(cache_dir, outname)
- if (not os.path.exists(vocab_path)) or no_cache:
- asset = download_asset(url, outname, tag=tag, no_cache=no_cache, cache_dir=cache_dir)
- if vocab_type is None:
- vocab_type = 'spm'
- return vocab_path, vocab_type
-
- def test_download():
- vocab = load_vocab()
modeling/config.py CHANGED
@@ -1,8 +1,114 @@
 import json
 import copy

+ from transformers.configuration_utils import PretrainedConfig
+
 __all__=['AbsModelConfig', 'ModelConfig']

+
+ class DebertaConfig(PretrainedConfig):
+ model_type = 'deberta-v2'
+
+ def __init__(self,
+ vocab_size_or_config_json_file,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ type_vocab_size=2,
+ relax_projection=0,
+ new_pos_ids=False,
+ initializer_range=0.02,
+ task_idx=None,
+ fp32_embedding=False,
+ ffn_type=0,
+ label_smoothing=None,
+ num_qkv=0,
+ seg_emb=False):
+ """Constructs DebertaConfig.
+
+ Args:
+ vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
+ hidden_size: Size of the encoder layers and the pooler layer.
+ num_hidden_layers: Number of hidden layers in the Transformer encoder.
+ num_attention_heads: Number of attention heads for each attention layer in
+ the Transformer encoder.
+ intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+ layer in the Transformer encoder.
+ hidden_act: The non-linear activation function (function or string) in the
+ encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+ hidden_dropout_prob: The dropout probability for all fully connected
+ layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob: The dropout ratio for the attention
+ probabilities.
+ max_position_embeddings: The maximum sequence length that this model might
+ ever be used with. Typically set this to something large just in case
+ (e.g., 512 or 1024 or 2048).
+ type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+ `BertModel`.
+ initializer_range: The stdev of the truncated_normal_initializer for
+ initializing all weight matrices.
+ """
+ if isinstance(vocab_size_or_config_json_file, str):
+ with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+ json_config = json.loads(reader.read())
+ for key, value in json_config.items():
+ self.__dict__[key] = value
+ elif isinstance(vocab_size_or_config_json_file, int):
+ self.vocab_size = vocab_size_or_config_json_file
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.hidden_act = hidden_act
+ self.intermediate_size = intermediate_size
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.type_vocab_size = type_vocab_size
+ self.relax_projection = relax_projection
+ self.new_pos_ids = new_pos_ids
+ self.initializer_range = initializer_range
+ self.task_idx = task_idx
+ self.fp32_embedding = fp32_embedding
+ self.ffn_type = ffn_type
+ self.label_smoothing = label_smoothing
+ self.num_qkv = num_qkv
+ self.seg_emb = seg_emb
+ else:
+ raise ValueError("First argument must be either a vocabulary size (int)"
+ "or the path to a pretrained model config file (str)")
+
+ # @classmethod
+ # def from_dict(cls, json_object):
+ # """Constructs a `BertConfig` from a Python dictionary of parameters."""
+ # config = DebertaConfig(vocab_size_or_config_json_file=-1)
+ # for key, value in json_object.items():
+ # config.__dict__[key] = value
+ # return config
+
+ # @classmethod
+ # def from_json_file(cls, json_file):
+ # """Constructs a `BertConfig` from a json file of parameters."""
+ # with open(json_file, "r", encoding='utf-8') as reader:
+ # text = reader.read()
+ # return cls.from_dict(json.loads(text))
+
+ # def __repr__(self):
+ # return str(self.to_json_string())
+
+ # def to_dict(self):
+ # """Serializes this instance to a Python dictionary."""
+ # output = copy.deepcopy(self.__dict__)
+ # return output
+
+ # def to_json_string(self):
+ # """Serializes this instance to a JSON string."""
+ # return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
 class AbsModelConfig(object):
 def __init__(self):
 pass
modeling/da_utils.py CHANGED
@@ -1,5 +1,4 @@
 import torch
- import pdb
 from functools import lru_cache
 import numpy as np

modeling/deberta.py CHANGED
@@ -9,14 +9,10 @@

 import copy
 import torch
- import os
-
- import json
 from .ops import *
 from .bert import *
 from .config import ModelConfig
 from .cache_utils import load_model_state
- import pdb

 __all__ = ['DeBERTa']

modeling/disentangled_attention.py CHANGED
@@ -11,12 +11,9 @@
 Disentangled SelfAttention module
 """

- import numpy as np
 import math
 import torch
 from torch import nn
- import functools
- import pdb

 from .ops import *
 from .da_utils import build_relative_position
modeling/flash.py DELETED
@@ -1,794 +0,0 @@
1
- #
2
- # Zhoubo
3
- #
4
- """
5
- FLASH: https://arxiv.org/abs/2202.10447
6
- """
7
- import copy
8
- import torch
9
- import os
10
- from collections import Sequence
11
- import json
12
-
13
- import torch
14
- import torch.nn as nn
15
- import torch.nn.functional as F
16
- from transformers.activations import ACT2FN
17
- from .modeling import *
18
- from .ops import XSoftmax, sequence_masking
19
-
20
- from .bert import *
21
- from .config import ModelConfig
22
- from .cache_utils import load_model_state
23
- import einops
24
-
25
-
26
- class ScaleNorm(nn.Module):
27
- def __init__(self, eps=1e-5):
28
- super().__init__()
29
- self.eps = eps
30
- self.scala = nn.Parameter(torch.ones(1))
31
-
32
- def forward(self, x):
33
- mean_square = (x ** 2).mean(dim=-1, keepdim=True)
34
- x = x * torch.rsqrt(mean_square + self.eps) * self.scala
35
- return x
36
-
37
-
38
-
39
- class OffsetScale(nn.Module):
40
- def __init__(self, dim, heads = 1):
41
- super().__init__()
42
- self.gamma = nn.Parameter(torch.ones(heads, dim))
43
- self.beta = nn.Parameter(torch.zeros(heads, dim))
44
- # nn.init.normal_(self.gamma, std = 0.02)
45
- # nn.init.xavier_uniform_(self.gamma)
46
-
47
- def forward(self, x):
48
- out = (x * self.gamma) + self.beta
49
- return out
50
-
51
-
52
- class ScaledSinuEmbedding(nn.Module):
53
- def __init__(self, dim):
54
- super().__init__()
55
- self.scale = nn.Parameter(torch.ones(1,))
56
- inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim))
57
- self.register_buffer('inv_freq', inv_freq)
58
-
59
- def forward(self, x):
60
- n, device = x.shape[1], x.device
61
- t = torch.arange(n, device = device).type_as(self.inv_freq)
62
- sinu = torch.einsum('i , j -> i j', t, self.inv_freq)
63
- emb = torch.cat((sinu.sin(), sinu.cos()), dim = -1)
64
- return emb * self.scale
65
-
66
-
67
- def RoPE(x, dim):
68
- """
69
- :param x: input tensor
70
- :param dim: dimension(s) to operate on
71
- :return: tensor
72
- """
73
- shape = x.shape
74
- if isinstance(dim, int):
75
- dim = [dim]
76
-
77
- spatial_shape = [shape[i] for i in dim]
78
- total_len = 1
79
- for i in spatial_shape:
80
- total_len *= i
81
- position = torch.reshape(torch.arange(total_len, dtype=torch.float, device=x.device), spatial_shape)
82
-
83
- for i in range(dim[-1] + 1, len(shape) - 1, 1):
84
- position = torch.unsqueeze(position, dim=-1)
85
-
86
- half_size = shape[-1] // 2
87
- freq_seq = -torch.arange(half_size, dtype=torch.float, device=x.device) / float(half_size)
88
- inv_freq = 10000 ** -freq_seq
89
- sinusoid = torch.einsum("...,d->...d", position, inv_freq)
90
- sin = torch.sin(sinusoid).repeat_interleave(2, -1)
91
- cos = torch.cos(sinusoid).repeat_interleave(2, -1)
92
- tensor_cross = torch.stack([-x[..., 1:: 2], x[..., :: 2]], -1).reshape(x.shape)
93
- # x1, x2 = torch.chunk(x, 2, dim=-1)
94
- # return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1)
95
- return x * cos + tensor_cross * sin
96
-
97
-
98
- def rel_pos_bias(seq_len, s):
99
- a = torch.rand([1, s], dtype=torch.float)
100
- b = torch.rand([1, s], dtype=torch.float)
101
- w = torch.rand([2 * seq_len - 1], dtype=torch.float)
102
- if seq_len <= 512:
103
- t = F.pad(w[: 2 * seq_len - 1], [0, seq_len]).repeat(seq_len)
104
- t = t[..., :-seq_len].reshape(-1, seq_len, 3 * seq_len - 2)
105
- r = (2 * seq_len - 1) // 2
106
- t = t[..., r:-r]
107
- else:
108
- a = RoPE(a.repeat(seq_len, 1), dim=[0])
109
- b = RoPE(b.repeat(seq_len, 1), dim=[0])
110
- t = torch.einsum("mk,nk->mn", a, b)
111
- return t
112
-
113
- def squared_relu(x, attention_mask, dim=-1):
114
- rmask = ~(attention_mask.bool())
115
- x = x.masked_fill(rmask, 0)
116
- return torch.square(F.relu(x))
117
-
118
-
119
- def attention_normalize(a, axis=-1, mask=None, fn='softmax'):
120
- if fn == 'softmax':
121
- return XSoftmax.apply(a, mask, axis)
122
- else:
123
- mask_ = a > -float('inf') / 10
124
- # mask_ = mask_.byte()
125
- mask_ = torch.sum(mask_, axis=axis, keepdim=True)
126
- l = torch.maximum(mask_, torch.ones_like(mask_))
127
- if fn == 'squared_relu':
128
- rmask = ~(mask.bool())
129
- a = a.masked_fill(rmask, 0)
130
- return torch.square(F.relu(a)) / l
131
- elif fn == 'softmax_plus':
132
- return XSoftmax.apply(a * torch.log(l) / np.log(512), mask, axis)
133
- return a
134
-
135
-
136
- class GAULinear(nn.Linear):
137
- def init_weight(self):
138
- nn.init.xavier_uniform_(self.weight)
139
-
140
-
141
- class GatedAttentionUnit(nn.Module):
142
- """
143
- GAU Block: Gate Attention Unit
144
- """
145
- def __init__(
146
- self,
147
- max_seq_length,
148
- hidden_size,
149
- attention_key_size=128,
150
- activation='swish',
151
- use_bias=True,
152
- attention_norm_type='squared_relu',
153
- attention_scale=True,
154
- dropout=0.1,
155
- pre_norm=False,
156
- norm_type="layer_norm",
157
- eps=1e-5,
158
- shift_token=False,
159
- use_rel_bias=False,
160
- add_residual=True,
161
- **kwargs,):
162
-
163
- super(GatedAttentionUnit, self).__init__(**kwargs)
164
- self.max_seq_length = max_seq_length
165
- self.units = hidden_size
166
- self.intermediate_size = self.units * 2
167
- self.key_size = attention_key_size
168
- self.activation = activation
169
- self.use_bias = use_bias
170
- self.attention_norm_type = attention_norm_type
171
- self.attention_scale = attention_scale
172
- self.dropout = StableDropout(dropout)
173
- self.i_dense = nn.Sequential(
174
- nn.Linear(self.units, 2 * self.intermediate_size + self.key_size, bias=self.use_bias),
175
- nn.SiLU()
176
- )
177
- self.o_dense = nn.Sequential(
178
- nn.Linear(self.intermediate_size, self.units, bias=self.use_bias),
179
- self.dropout)
180
- self.q_scaleoffset = OffsetScale(self.key_size)
181
- self.k_scaleoffset = OffsetScale(self.key_size)
182
- self.pre_norm = pre_norm
183
- self.norm = (nn.LayerNorm(hidden_size, eps=eps) if norm_type.lower() == "layer_norm" else ScaleNorm(eps=eps))
184
- self.add_residual = add_residual
185
-
186
- def forward(self, x, attention_mask=None, **kwargs):
187
- shortcut = x
188
-
189
- if self.pre_norm:
190
- x = self.norm(x)
191
-
192
- x = self.i_dense(x)
193
- u, v, qk = torch.split(x, [self.intermediate_size, self.intermediate_size, self.key_size], dim=-1)
194
- q, k = self.q_scaleoffset(qk), self.k_scaleoffset(qk)
195
- qk = RoPE(torch.stack([q, k], 2), dim=1)
196
- q, k = qk[:, :, 0], qk[:, :, 1]
197
- a = torch.einsum('bmd,bnd->bmn', q, k)
198
- if self.attention_scale:
199
- a = a / self.key_size**0.5
200
- a = sequence_masking(a, attention_mask, '-inf', -1)
201
- A = attention_normalize(a, -1, fn=self.attention_norm_type)
202
- if self.dropout:
203
- A = self.dropout(A)
204
- out = self.o_dense(u * torch.einsum('bmn,bnd->bmd', A, v))
205
-
206
- if self.add_residual:
207
- out = out + shortcut
208
- if not self.pre_norm:
209
- out = self.norm(out)
210
- return out
211
- # # Add RoPE
212
- # if p_bias == 'rotary':
213
- # qk = K.stack([q, k], 2)
214
- # qk = apply_rotary_position_embeddings(inputs[n], qk)[0]
215
- # q, k = qk[:, :, 0], qk[:, :, 1]
216
- # # Attention
217
- # a = tf.einsum('bmd,bnd->bmn', q, k)
218
- # if self.attention_scale:
219
- # a = a / self.key_size**0.5
220
- # if a_bias is not None:
221
- # a = a + a_bias
222
- # a = sequence_masking(a, mask, '-inf', -1)
223
- # A = attention_normalize(a, -1, self.normalization)
224
- # if self.attention_dropout:
225
- # A = Dropout(self.attention_dropout)(A)
226
- # # Compute the output
227
- # o = self.o_dense(u * tf.einsum('bmn,bnd->bmd', A, v))
228
-
229
- # return o
230
-
231
- class GAU(nn.Module):
232
- def __init__(self, max_seq_length, hidden_size, expansion_factor=2, s=128, norm_type="layer_norm", eps=1e-5,
233
- hidden_act="silu", shift_token=False, use_rel_bias=False, attention_norm_type='softmax',
234
- pre_norm=False, dropout=0, add_residual = True):
235
- super(GAU, self).__init__()
236
- self.max_seq_length = max_seq_length
237
- self.shift_token = shift_token
238
- hidden_dim = int(expansion_factor * hidden_size)
239
- self.norm = (nn.LayerNorm(hidden_size, eps=eps) if norm_type == "layer_norm" else ScaleNorm(eps=eps))
240
- self.use_rel_bias = use_rel_bias
241
- self.attention_norm_type = attention_norm_type
242
- # if attention_norm_type == 'relu':
243
- # self.attention_norm_func = squared_relu
244
- # else:
245
- # self.attention_norm_func = XSoftmax.apply
246
- # self.norm = norm_klass(hidden_size)
247
-
248
- self.dropout = nn.Dropout(dropout)
249
-
250
- self.to_hidden = nn.Sequential(
251
- nn.Linear(hidden_size, hidden_dim * 2),
252
- nn.SiLU()
253
- )
254
-
255
- self.to_qk = nn.Sequential(
256
- nn.Linear(hidden_size, s),
257
- nn.SiLU()
258
- )
259
-
260
- self.offsetscale = OffsetScale(s, heads = 2)
261
-
262
- self.to_out = nn.Sequential(
263
- nn.Linear(hidden_dim, hidden_size),
264
- nn.Dropout(dropout)
265
- )
266
-
267
- self.add_residual = add_residual
268
- self.act_fn = ACT2FN[hidden_act]
269
- self.pre_norm = pre_norm
270
-
271
-
272
- def forward(
273
- self,
274
- x,
275
- relative_pos = None,
276
- attention_mask = None
277
- ):
278
- seq_len, device = x.shape[-2], x.device
279
- if self.pre_norm:
280
- normed_x = self.norm(x)
281
- else:
282
- normed_x = x
283
- v, gate = self.to_hidden(normed_x).chunk(2, dim = -1)
284
-
285
- qk = self.to_qk(normed_x)
286
- base = self.offsetscale(qk)
287
- base = RoPE(base, 1)
288
- q, k = base.unbind(dim = -2)
289
- sim = torch.einsum('b i d, b j d -> b i j', q, k)
290
-
291
- if relative_pos is not None:
292
- sim = sim + relative_pos
293
- if attention_mask is not None:
294
- if attention_mask.dim() < 3:
295
- attention_mask = einops.rearrange(attention_mask, 'b j -> b 1 j')
296
- # attn = attn.masked_fill(~attention_mask.bool(), 0.)
297
- attn = attention_normalize(sim, mask=attention_mask, fn=self.attention_norm_type)
298
- # attn = F.relu(sim) ** 2 / seq_len# / q.size(-1)
299
- # logger.info(attn.max())
300
- attn = self.dropout(attn)
301
- # if self.causal:
302
- # causal_mask = torch.ones((seq_len, seq_len), dtype = torch.bool, device = device).triu(1)
303
- # attn = attn.masked_fill(causal_mask, 0.)
304
-
305
- out = torch.einsum('b i j, b j d -> b i d', attn, v)
306
- out = out * gate
307
-
308
- out = self.to_out(out)
309
-
310
- if self.add_residual:
311
- out = out + x
312
- if not self.pre_norm:
313
- out = self.norm(out)
314
- return out
315
-
316
-
317
- class GAULayer(nn.Module):
318
- def __init__(self, config, shift_token=False, use_ffn=False):
319
- super(GAULayer, self).__init__()
320
- self.attention = GatedAttentionUnit(config.max_position_embeddings, config.hidden_size,
321
- shift_token=shift_token, use_rel_bias=config.use_rel_bias,
322
- norm_type=config.norm_type, attention_norm_type=config.attention_norm_type,
323
- pre_norm=config.pre_norm, dropout=config.hidden_dropout_prob)
324
- if use_ffn:
325
- self.intermediate = BertIntermediate(config)
326
- self.output = BertOutput(config)
327
- self.use_ffn = use_ffn
328
-
329
- def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
330
- attention_output = self.attention(hidden_states, attention_mask=attention_mask, relative_pos=relative_pos)
331
- if self.use_ffn:
332
- intermediate_output = self.intermediate(attention_output)
333
- layer_output = self.output(intermediate_output, attention_output)
334
- return layer_output
335
- else:
336
- return attention_output
337
-
338
-
339
- class FlashBlock(nn.Module):
340
- """
341
- FLASH Block: Fast Linear Attention with a Single Head
342
- """
343
-
344
- def __init__(self, model_size, sequence_length, chunk_size=256, expansion_factor=2, s=128, norm_type="layer_norm", eps=1e-5,
345
- hidden_act="silu"):
346
- super(FlashBlock, self).__init__()
347
- self.s = s
348
- self.eps = eps
349
- self.norm_type = norm_type
350
- self.model_size = model_size
351
- self.chunk_size = chunk_size
352
- self.hidden_act = hidden_act
353
- self.sequence_length = sequence_length
354
- self.expansion_factor = expansion_factor
355
- self.e = int(self.model_size * self.expansion_factor)
356
-
357
- self.dense1 = nn.Linear(self.model_size, 2 * self.e + self.s, bias=True)
358
- self.gamma = nn.Parameter(torch.rand((4, self.s)))
359
- self.beta = nn.Parameter(torch.rand((4, self.s)))
360
- self.dense2 = nn.Linear(self.e, self.model_size)
361
- self.LayerNorm = (
362
- nn.LayerNorm(model_size, eps=self.eps) if norm_type == "layer_norm" else ScaleNorm(eps=self.eps))
363
-
364
- nn.init.xavier_normal_(self.dense1.weight)
365
- self.act_fn = ACT2FN(self.hidden_act)
366
-
367
- def global_linear_attention(self, query, key, value, causal):
368
- if causal:
369
- kv = torch.einsum("bgcs, bgce->bgse", key, value)
370
- kv = torch.cumsum(kv, dim=1)
371
- lin_v = torch.einsum("bgcs, bgse->bgce", query, kv)
372
- return lin_v
373
- else:
374
- kv = torch.einsum("bgcs, bgce->bse", key, value)
375
- lin_v = torch.einsum("bgcs, bse->bgce", query, kv)
376
- return lin_v
377
-
378
- def segment_ids_to_mask(self, segment_ids, causal=False):
379
- """Generate the segment mask from the segment ids.
380
- The segment mask is used to remove the attention between tokens in different documents.
381
- """
382
- min_ids, max_ids = torch.min(segment_ids, dim=-1).values, torch.max(segment_ids, dim=-1).values
383
- # 1.0 indicates in the same group and 0.0 otherwise
384
- mask = torch.logical_and(torch.less_equal(min_ids[:, :, None], max_ids[:, None, :]),
385
- torch.greater_equal(max_ids[:, :, None], min_ids[:, None, :]))
386
- mask = torch.tensor(mask, torch.float32)
387
- if causal:
388
- g = segment_ids.size()[1]
389
- causal_mask = 1.0 - torch.triu(torch.ones([g, g], dtype=torch.float32)) # keep the main diagonal and the elements above it
390
- mask *= causal_mask
391
- mask = torch.div(mask, torch.sum(mask, dim=-1, keepdim=True))
392
- return mask
393
-
394
- def forward(self, x, causal=False, attention_mask=None, sequence_mask=None, **kwargs):
395
- """
396
- inputs: [batch_size, num_chunk, chunk_length, model_size]
397
- """
398
- _, g, n, d = x.size()
399
- shortcut, x = x, self.LayerNorm(x)
400
- # Obtain Z via a linear projection, see Eq. (4) of the paper
401
- uv = self.dense1(x)
402
- # Split uv along the last dimension into Ug: [C*e], Vg: [C*e], Zg: [C*s] (Section 3.2 of the paper)
403
- # u:[batch_size, num_chunk, chunk_length, self.e]
404
- # v:[batch_size, num_chunk, chunk_length, self.e]
405
- # z:[batch_size, num_chunk, chunk_length, self.s]
406
- u, v, z = torch.split(self.act_fn(uv), [self.e, self.e, self.s], dim=-1)
407
-
408
- # Generate quad_q, quad_k, lin_q, lin_k
409
- # First apply a simple offset and scale, and fuse in the RoPE position embeddings
410
- z = torch.einsum("...r, hr->...hr", z, self.gamma) + self.beta
411
- z = RoPE(z, dim=[1, 2])
412
- quad_q, quad_k, lin_q, lin_k = torch.unbind(z, dim=-2) # unbind along dim -2 to get quad_q, quad_k, lin_q and lin_k
413
- # Compute the global lin_v
414
- lin_v = self.global_linear_attention(lin_q, lin_k, v, causal)
415
- if causal:
416
- # Linear attention part
417
- lin_kv = torch.einsum("bgnk, bgne->bgke", lin_k, lin_v) / torch.tensor(n, x.dtype) # see Eq. (7)
418
- mask = self.segment_ids_to_mask(segment_ids=segment_ids, causal=causal)
419
- cum_lin_kv = torch.einsum('bhke, bgh->bgke', lin_kv, mask)
420
- linear = torch.einsum("bgnk, bgke->bgne", lin_kv, cum_lin_kv)
421
- # Quadratic attention
422
- quad_qk = torch.einsum("bgnk, bgmk->bgnm", quad_q, quad_k) # the "local attention per chunk" part of the paper
423
- bias = rel_pos_bias(self.sequence_length, self.s)[:, :n, :n]
424
- kernel = torch.square(F.relu(quad_qk / n + bias)) # the relu**2 part of the paper
425
- causal_mask = torch.triu(torch.ones([n, n], dtype=x.dtype))
426
- quadratic = torch.einsum("bgnm, bgme->bgne", kernel * causal_mask, v)
427
- else:
428
- lin_kv = torch.einsum("bgnk, bgne->bgke", lin_k, lin_v) / torch.tensor(n, x.dtype) # see Eq. (7)
429
- mask = self.segment_ids_to_mask(segment_ids=segment_ids, causal=causal)
430
- lin_kv = torch.einsum("bhke, bgh->bgke", lin_kv, mask)
431
- linear = torch.einsum("bgnk, bgke->bgne", lin_q, lin_kv)
432
- # Quadratic attention
433
- quad_qk = torch.einsum("bgnk, bgmk->bgnm", quad_q, quad_k) # the "local attention per chunk" part of the paper
434
- bias = rel_pos_bias(self.sequence_length, self.s)[:, :n, :n]
435
- kernel = torch.square(F.relu(quad_qk / n + bias)) # the relu**2 part of the paper
436
- quadratic = torch.einsum("bgnm, bgme->bgne", kernel, v)
437
- x = u * (quadratic + linear)
438
- x = self.dense2(x)
439
- x = x + shortcut
440
- return x
441
-
442
- class RelativePositionBias(nn.Module):
443
- def __init__(
444
- self,
445
- scale,
446
- causal = False,
447
- num_buckets = 32,
448
- max_distance = 128
449
- ):
450
- super().__init__()
451
- self.scale = scale
452
- self.causal = causal
453
- self.num_buckets = num_buckets
454
- self.max_distance = max_distance
455
- self.relative_attention_bias = nn.Embedding(num_buckets, 1)
456
-
457
- @staticmethod
458
- def _relative_position_bucket(
459
- relative_position,
460
- causal = True,
461
- num_buckets = 32,
462
- max_distance = 128
463
- ):
464
- ret = 0
465
- n = -relative_position
466
- if not causal:
467
- num_buckets //= 2
468
- ret += (n < 0).long() * num_buckets
469
- n = torch.abs(n)
470
- else:
471
- n = torch.max(n, torch.zeros_like(n))
472
-
473
- max_exact = num_buckets // 2
474
- is_small = n < max_exact
475
-
476
- val_if_large = max_exact + (
477
- torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
478
- ).long()
479
- val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
480
-
481
- ret += torch.where(is_small, n, val_if_large)
482
- return ret
483
-
484
- def forward(self, x):
485
- i, j, device = *x.shape[-2:], x.device
486
- q_pos = torch.arange(i, dtype = torch.long, device = device)
487
- k_pos = torch.arange(j, dtype = torch.long, device = device)
488
- rel_pos = rearrange(k_pos, 'j -> 1 j') - rearrange(q_pos, 'i -> i 1')
489
- rp_bucket = self._relative_position_bucket(rel_pos, causal = self.causal, num_buckets = self.num_buckets, max_distance = self.max_distance)
490
- values = self.relative_attention_bias(rp_bucket)
491
- bias = rearrange(values, 'i j 1 -> i j')
492
- return bias * self.scale
493
-
494
-
495
- class FlashEmbeddings(nn.Module):
496
- """Construct the embeddings from word, position and token_type embeddings.
497
- """
498
- def __init__(self, config, with_position=False):
499
- super(FlashEmbeddings, self).__init__()
500
- self.word_embeddings = nn.Embedding(
501
- config.vocab_size, config.hidden_size)
502
- self.token_type_embeddings = nn.Embedding(
503
- config.type_vocab_size, config.hidden_size)
504
- self.with_position = with_position
505
- if with_position:
506
- self.position_embeddings = ScaledSinuEmbedding(config.hidden_size)
507
-
508
- # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
509
- # any TensorFlow checkpoint file
510
- self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5)
511
- self.dropout = StableDropout(config.hidden_dropout_prob)
512
-
513
- def forward(self, input_ids, token_type_ids=None, position_ids=None, token_mask=None):
514
- seq_length = input_ids.size(1)
515
- if position_ids is None:
516
- position_ids = torch.arange(
517
- seq_length, dtype=torch.long, device=input_ids.device)
518
- position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
519
- if token_type_ids is None:
520
- token_type_ids = torch.zeros_like(input_ids)
521
-
522
- words_embeddings = self.word_embeddings(input_ids)
523
- if self.with_position:
524
- position_embeddings = self.position_embeddings(words_embeddings)
525
- else:
526
- position_embeddings = 0
527
- token_type_embeddings = self.token_type_embeddings(token_type_ids)
528
-
529
- # if self.num_pos_emb > 1:
530
- # num_batch = position_embeddings.size(0)
531
- # num_pos = position_embeddings.size(1)
532
- # position_embeddings = position_embeddings.view(
533
- # num_batch, num_pos, self.num_pos_emb, -1)[torch.arange(0, num_batch).long(), :, task_idx, :]
534
-
535
- embeddings = words_embeddings + position_embeddings + token_type_embeddings
536
- # if self.fp32_embedding:
537
- # embeddings = embeddings.half()
538
- embeddings = MaskedLayerNorm(self.LayerNorm, embeddings, token_mask)
539
- embeddings = self.dropout(embeddings)
540
- return {
541
- 'embeddings': embeddings,
542
- 'position_embeddings': position_embeddings}
543
-
544
-
545
- class GAUEncoder(nn.Module):
546
- def __init__(self, config, shift_token=False):
547
- super().__init__()
548
- layer = GAULayer(config, shift_token=shift_token)
549
- self.layer = nn.ModuleList([copy.deepcopy(layer)
550
- for _ in range(config.num_hidden_layers)])
551
-
552
- def get_attention_mask(self, attention_mask):
553
- if attention_mask.dim() <= 2:
554
- extended_attention_mask = attention_mask.unsqueeze(1)
555
- attention_mask = extended_attention_mask*extended_attention_mask.squeeze(-2).unsqueeze(-1)
556
- attention_mask = attention_mask #.byte()
557
- return attention_mask
558
-
559
- def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, return_att=False, query_states = None, relative_pos=None):
560
- all_encoder_layers = []
561
- att_matrices = []
562
- if isinstance(hidden_states, Sequence):
563
- next_kv = hidden_states[0]
564
- else:
565
- next_kv = hidden_states
566
- # rel_embeddings = self.get_rel_embedding()
567
- for i, layer_module in enumerate(self.layer):
568
- output_states = layer_module(next_kv, attention_mask, query_states = query_states, relative_pos=relative_pos)
569
- if return_att:
570
- output_states, att_m = output_states
571
-
572
- # if i == 0 and self.with_conv:
573
- # prenorm = output_states #output['prenorm_states']
574
- # output_states = self.conv(hidden_states, prenorm, input_mask)
575
-
576
- if query_states is not None:
577
- query_states = output_states
578
- if isinstance(hidden_states, Sequence):
579
- next_kv = hidden_states[i+1] if i+1 < len(self.layer) else None
580
- else:
581
- next_kv = output_states
582
-
583
- if output_all_encoded_layers:
584
- all_encoder_layers.append(output_states)
585
- if return_att:
586
- att_matrices.append(att_m)
587
- if not output_all_encoded_layers:
588
- all_encoder_layers.append(output_states)
589
- if return_att:
590
- att_matrices.append(att_m)
591
- return {
592
- 'hidden_states': all_encoder_layers,
593
- 'attention_matrices': att_matrices
594
- }
595
-
596
- class FlashEncoder(nn.Module):
597
- def __init__(self, config):
598
- super().__init__(config)
599
- layer = GateAttentionUnit(config.max_position_embeddings, config.hidden_size)
600
- self.layer = nn.ModuleList([copy.deepcopy(layer)
601
- for _ in range(config.num_hidden_layers)])
602
-
603
- def forward(self, hidden_states, attention_mask, token_mask=None,
604
- output_all_encoded_layers=True,
605
- prev_embedding=None, prev_encoded_layers=None, mask_qkv=None, seg_ids=None):
606
- # history embedding and encoded layer must be simultaneously given
607
- assert (prev_embedding is None) == (prev_encoded_layers is None)
608
-
609
- all_encoder_layers = []
610
- if (prev_embedding is not None) and (prev_encoded_layers is not None):
611
- history_states = prev_embedding
612
- for i, layer_module in enumerate(self.layer):
613
- hidden_states = layer_module(
614
- hidden_states, attention_mask, history_states=history_states, mask_qkv=mask_qkv, seg_ids=seg_ids)
615
- if output_all_encoded_layers:
616
- all_encoder_layers.append(hidden_states)
617
- if prev_encoded_layers is not None:
618
- history_states = prev_encoded_layers[i]
619
- else:
620
- for layer_module in self.layer:
621
- hidden_states = layer_module(
622
- hidden_states, attention_mask=attention_mask, mask_qkv=mask_qkv, seg_ids=seg_ids)
623
- if output_all_encoded_layers:
624
- all_encoder_layers.append(hidden_states)
625
- if not output_all_encoded_layers:
626
- all_encoder_layers.append(hidden_states)
627
- return all_encoder_layers
628
-
629
- # class FlashQuadModel(BertModel):
630
- # def __init__(self, config, pooler=False, shift_token=False, causal=False) -> None:
631
- # super().__init__(config)
632
- # self.embeddings = FlashEmbeddings(config)
633
- # self.encoder = GAUEncoder(config, causal=causal, shift_token=shift_token)
634
- # if not pooler:
635
- # self.pooler = None
636
- # self.apply(self.init_bert_weights)
637
-
638
-
639
- class FlashQuadModel(torch.nn.Module):
640
- """
641
- Parameters:
642
- config:
643
- A model config class instance with the configuration to build a new model. The schema is similar to `BertConfig`,
644
-
645
- pre_trained:
646
- The pre-trained DeBERTa model, it can be a physical path of a pre-trained DeBERTa model or a released configurations,
647
- i.e. [**base, large, base_mnli, large_mnli**]
648
-
649
- """
650
-
651
- def __init__(self, config=None, pre_trained=None, pooler=False, shift_token=False, causal=False, **kwargs):
652
- super().__init__()
653
- state = None
654
- if pre_trained is not None:
655
- state, model_config = load_model_state(pre_trained)
656
- if config is not None and model_config is not None:
657
- for k in config.__dict__:
658
- if k not in ['hidden_size',
659
- 'intermediate_size',
660
- 'num_attention_heads',
661
- 'num_hidden_layers',
662
- 'vocab_size',
663
- 'max_position_embeddings']:
664
- model_config.__dict__[k] = config.__dict__[k]
665
- config = copy.copy(model_config)
666
- self.embeddings = FlashEmbeddings(config, with_position=True)
667
- self.encoder = GAUEncoder(config, shift_token=shift_token)
668
- if not pooler:
669
- self.pooler = None
670
- self.config = config
671
- self.pre_trained = pre_trained
672
- self.apply_state(state)
673
-
674
- def get_attention_mask(self, input_ids=None, token_type_ids=None, attention_mask=None, input_mask=None):
675
- if attention_mask is None:
676
- if input_mask is not None:
677
- return input_mask.unsqueeze(-1).expand(input_mask.size(0), input_mask.size(1), input_mask.size(1))
678
- else:
679
- return torch.ones_like(input_ids, dtype=torch.uint8).unsqueeze(-1).expand(input_mask.size(0), input_mask.size(1), input_mask.size(1))
680
- else:
681
- if attention_mask.dim() == 2:
682
- if input_mask is not None:
683
- attention_mask = attention_mask * input_mask
684
- return attention_mask.unsqueeze(-1).expand(input_mask.size(0), input_mask.size(1), attention_mask.size(-1))
685
- if attention_mask.dim() == 4:
686
- attention_mask = attention_mask.squeeze(2)
687
- if attention_mask.dim() == 3:
688
- if input_mask is not None:
689
- return attention_mask * input_mask.unsqueeze(-1).expand(input_mask.size(0), input_mask.size(1), attention_mask.size(-1))
690
- else:
691
- return attention_mask
692
-
693
-
694
- def forward(self, input_ids, input_mask, attention_mask=None, token_type_ids=None,
695
- output_all_encoded_layers=True, position_ids=None, return_att=False):
696
- """
697
- Args:
698
- input_ids:
699
- a torch.LongTensor of shape [batch_size, sequence_length] \
700
- with the word token indices in the vocabulary
701
-
702
- attention_mask:
703
- an optional parameter for input mask or attention mask.
704
-
705
- - If it's an input mask, then it will be torch.LongTensor of shape [batch_size, sequence_length] with indices \
706
- selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max \
707
- input sequence length in the current batch. It's the mask that we typically use for attention when \
708
- a batch has varying length sentences.
709
-
710
- - If it's an attention mask then it will be torch.LongTensor of shape [batch_size, sequence_length, sequence_length]. \
711
- In this case, it's a mask indicating which tokens in the sequence should be attended to by other tokens in the sequence.
712
-
713
- token_type_ids:
714
- an optional torch.LongTensor of shape [batch_size, sequence_length] with the token \
715
- types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to \
716
- a `sentence B` token (see BERT paper for more details).
717
-
718
- output_all_encoded_layers:
719
- whether to output results of all encoder layers, default, True
720
-
721
- Returns:
722
-
723
- - The output of the stacked transformer layers if `output_all_encoded_layers=True`, else \
724
- the last layer of stacked transformer layers
725
-
726
- - Attention matrix of self-attention layers if `return_att=True`
727
-
728
-
729
- Example::
730
-
731
- # Batch of wordPiece token ids.
732
- # Each sample was padded with zero to the maximum length of the batch
733
- input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
734
- # Mask of valid input ids
735
- attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
736
-
737
- # DeBERTa model initialized with pretrained base model
738
- bert = DeBERTa(pre_trained='base')
739
-
740
- encoder_layers = bert(input_ids, attention_mask=attention_mask)
741
-
742
- """
743
- if token_type_ids is None:
744
- token_type_ids = torch.zeros_like(input_ids)
745
- # input_mask = torch.ones_like(input_ids)
746
-
747
- if input_mask is None:
748
- idxs = torch.flip(torch.cumsum(torch.flip(token_type_ids, [-1]), axis=1), [-1])
749
- input_mask = idxs > 0
750
- if not torch.any(input_mask):
751
- input_mask = torch.ones_like(input_ids)
752
- input_mask = input_mask # .byte()
753
- attention_mask = self.get_attention_mask(input_ids, token_type_ids, attention_mask, input_mask)
754
- attention_mask = attention_mask #.byte()
755
- embedding_output = self.embeddings(input_ids.to(torch.long), token_type_ids.to(torch.long), position_ids, input_mask)
756
- encoder_output = self.encoder(embedding_output['embeddings'], attention_mask, output_all_encoded_layers=output_all_encoded_layers, return_att = return_att)
757
- encoder_output.update(embedding_output)
758
- return encoder_output
759
-
760
- def apply_state(self, state = None):
761
- """ Load state from previous loaded model state dictionary.
762
-
763
- Args:
764
- state (:obj:`dict`, optional): State dictionary as the state returned by torch.module.state_dict(), default: `None`. \
765
- If it's `None`, then will use the pre-trained state loaded via the constructor to re-initialize \
766
- the `DeBERTa` model
767
- """
768
- if self.pre_trained is None and state is None:
769
- return
770
- if state is None:
771
- state, config = load_model_state(self.pre_trained)
772
- self.config = config
773
-
774
- prefix = ''
775
- for k in state:
776
- if 'embeddings.' in k:
777
- if not k.startswith('embeddings.'):
778
- prefix = k[:k.index('embeddings.')]
779
- break
780
-
781
- missing_keys = []
782
- unexpected_keys = []
783
- error_msgs = []
784
- self._load_from_state_dict(state, prefix = prefix, local_metadata=None, strict=True, missing_keys=missing_keys, unexpected_keys=unexpected_keys, error_msgs=error_msgs)
785
-
786
-
787
- class FlashModel(BertModel):
788
- def __init__(self, config) -> None:
789
- super().__init__(config)
790
- self.encoder = FlashEncoder(config)
791
- self.apply(self.init_bert_weights)
792
-
793
- if __name__ == '__main__':
794
- model = FlashModel(768, 64)
modeling/focal_loss.py DELETED
@@ -1,200 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
- import torch.cuda.amp as amp
5
-
6
-
7
- ##
8
- # version 1: use torch.autograd
9
- class FocalLossV1(nn.Module):
10
-
11
- def __init__(self,
12
- alpha=0.25,
13
- gamma=2,
14
- reduction='mean',):
15
- super(FocalLossV1, self).__init__()
16
- self.alpha = alpha
17
- self.gamma = gamma
18
- self.reduction = reduction
19
- self.crit = nn.BCEWithLogitsLoss(reduction='none')
20
-
21
- def forward(self, logits, label):
22
- '''
23
- Usage is same as nn.BCEWithLogits:
24
- >>> criteria = FocalLossV1()
25
- >>> logits = torch.randn(8, 19, 384, 384)
26
- >>> lbs = torch.randint(0, 2, (8, 19, 384, 384)).float()
27
- >>> loss = criteria(logits, lbs)
28
- '''
29
- probs = torch.sigmoid(logits)
30
- coeff = torch.abs(label - probs).pow(self.gamma).neg()
31
- log_probs = torch.where(logits >= 0,
32
- F.softplus(logits, -1, 50),
33
- logits - F.softplus(logits, 1, 50))
34
- log_1_probs = torch.where(logits >= 0,
35
- -logits + F.softplus(logits, -1, 50),
36
- -F.softplus(logits, 1, 50))
37
- loss = label * self.alpha * log_probs + (1. - label) * (1. - self.alpha) * log_1_probs
38
- loss = loss * coeff
39
-
40
- if self.reduction == 'mean':
41
- loss = loss.mean()
42
- if self.reduction == 'sum':
43
- loss = loss.sum()
44
- return loss
45
-
46
-
47
- ##
48
- # version 2: user derived grad computation
49
- class FocalSigmoidLossFuncV2(torch.autograd.Function):
50
- '''
51
- compute backward directly for better numeric stability
52
- '''
53
- @staticmethod
54
- @amp.custom_fwd(cast_inputs=torch.float32)
55
- def forward(ctx, logits, label, alpha, gamma):
56
- # logits = logits.float()
57
-
58
- probs = torch.sigmoid(logits)
59
- coeff = (label - probs).abs_().pow_(gamma).neg_()
60
- log_probs = torch.where(logits >= 0,
61
- F.softplus(logits, -1, 50),
62
- logits - F.softplus(logits, 1, 50))
63
- log_1_probs = torch.where(logits >= 0,
64
- -logits + F.softplus(logits, -1, 50),
65
- -F.softplus(logits, 1, 50))
66
- ce_term1 = log_probs.mul_(label).mul_(alpha)
67
- ce_term2 = log_1_probs.mul_(1. - label).mul_(1. - alpha)
68
- ce = ce_term1.add_(ce_term2)
69
- loss = ce * coeff
70
-
71
- ctx.vars = (coeff, probs, ce, label, gamma, alpha)
72
-
73
- return loss
74
-
75
- @staticmethod
76
- @amp.custom_bwd
77
- def backward(ctx, grad_output):
78
- '''
79
- compute gradient of focal loss
80
- '''
81
- (coeff, probs, ce, label, gamma, alpha) = ctx.vars
82
-
83
- d_coeff = (label - probs).abs_().pow_(gamma - 1.).mul_(gamma)
84
- d_coeff.mul_(probs).mul_(1. - probs)
85
- d_coeff = torch.where(label < probs, d_coeff.neg(), d_coeff)
86
- term1 = d_coeff.mul_(ce)
87
-
88
- d_ce = label * alpha
89
- d_ce.sub_(probs.mul_((label * alpha).mul_(2).add_(1).sub_(label).sub_(alpha)))
90
- term2 = d_ce.mul(coeff)
91
-
92
- grads = term1.add_(term2)
93
- grads.mul_(grad_output)
94
-
95
- return grads, None, None, None
96
-
97
-
98
- class FocalLossV2(nn.Module):
99
-
100
- def __init__(self,
101
- alpha=0.25,
102
- gamma=2,
103
- reduction='mean'):
104
- super(FocalLossV2, self).__init__()
105
- self.alpha = alpha
106
- self.gamma = gamma
107
- self.reduction = reduction
108
-
109
- def forward(self, logits, label):
110
- '''
111
- Usage is same as nn.BCEWithLogits:
112
- >>> criteria = FocalLossV2()
113
- >>> logits = torch.randn(8, 19, 384, 384)
114
- >>> lbs = torch.randint(0, 2, (8, 19, 384, 384)).float()
115
- >>> loss = criteria(logits, lbs)
116
- '''
117
- loss = FocalSigmoidLossFuncV2.apply(logits, label, self.alpha, self.gamma)
118
- if self.reduction == 'mean':
119
- loss = loss.mean()
120
- if self.reduction == 'sum':
121
- loss = loss.sum()
122
- return loss
123
-
124
-
125
- if __name__ == '__main__':
126
- import torchvision
127
- import torch
128
- import numpy as np
129
- import random
130
- torch.manual_seed(15)
131
- random.seed(15)
132
- np.random.seed(15)
133
- torch.backends.cudnn.deterministic = True
134
-
135
- class Model(nn.Module):
136
- def __init__(self):
137
- super(Model, self).__init__()
138
- net = torchvision.models.resnet18(pretrained=False)
139
- self.conv1 = net.conv1
140
- self.bn1 = net.bn1
141
- self.maxpool = net.maxpool
142
- self.relu = net.relu
143
- self.layer1 = net.layer1
144
- self.layer2 = net.layer2
145
- self.layer3 = net.layer3
146
- self.layer4 = net.layer4
147
- self.out = nn.Conv2d(512, 3, 3, 1, 1)
148
- def forward(self, x):
149
- feat = self.conv1(x)
150
- feat = self.bn1(feat)
151
- feat = self.relu(feat)
152
- feat = self.maxpool(feat)
153
- feat = self.layer1(feat)
154
- feat = self.layer2(feat)
155
- feat = self.layer3(feat)
156
- feat = self.layer4(feat)
157
- feat = self.out(feat)
158
- out = F.interpolate(feat, x.size()[2:], mode='bilinear', align_corners=True)
159
- return out
160
- net1 = Model()
161
- net2 = Model()
162
- net2.load_state_dict(net1.state_dict())
163
-
164
- criteria1 = FocalLossV2()
165
- # criteria2 = FocalLossV3()
166
- net1.cuda()
167
- net2.cuda()
168
- net1.train()
169
- net2.train()
170
- net1.double()
171
- net2.double()
172
- criteria1.cuda()
173
- # criteria2.cuda()
174
-
175
- optim1 = torch.optim.SGD(net1.parameters(), lr=1e-2)
176
- # optim2 = torch.optim.SGD(net2.parameters(), lr=1e-2)
177
-
178
- bs = 16
179
- for it in range(300000):
180
- inten = torch.randn(bs, 3, 224, 244).cuda()
181
- # lbs = torch.randint(0, 2, (bs, 3, 224, 244)).float().cuda()
182
- lbs = torch.randn(bs, 3, 224, 244).sigmoid().cuda()
183
- inten = inten.double()
184
- lbs = lbs.double()
185
- logits = net1(inten)
186
- loss1 = criteria1(logits, lbs)
187
- optim1.zero_grad()
188
- loss1.backward()
189
- optim1.step()
190
- # logits = net2(inten)
191
- # loss2 = criteria2(logits, lbs)
192
- # optim2.zero_grad()
193
- # loss2.backward()
194
- # optim2.step()
195
- # with torch.no_grad():
196
- # if (it+1) % 50 == 0:
197
- # print('iter: {}, ================='.format(it+1))
198
- # print('out.weight: ', torch.mean(torch.abs(net1.out.weight - net2.out.weight)).item())
199
- # print('conv1.weight: ', torch.mean(torch.abs(net1.conv1.weight - net2.conv1.weight)).item())
200
- # print('loss: ', loss1.item() - loss2.item())
modeling/gat.py DELETED
@@ -1,665 +0,0 @@
1
- #
2
- # Zhoubo
3
- #
4
- """
5
- FLASH: https://arxiv.org/abs/2202.10447
6
- """
7
- import copy
8
- import torch
9
- import math
10
- import os
11
- from collections import Sequence
12
- import json
13
- import numpy as np
14
- import torch
15
- import torch.nn as nn
16
- import torch.nn.functional as F
17
- from transformers.activations import ACT2FN
18
- from .ops import sequence_masking, XSoftmax, StableDropout, MaskedLayerNorm
19
- from .config import ModelConfig
20
- from .cache_utils import load_model_state
21
- import einops
22
-
23
-
24
- class ScaleNorm(nn.Module):
25
- def __init__(self, eps=1e-5):
26
- super().__init__()
27
- self.eps = eps
28
- self.scala = nn.Parameter(torch.ones(1))
29
-
30
- def forward(self, x):
31
- mean_square = (x ** 2).mean(dim=-1, keepdim=True)
32
- x = x * torch.rsqrt(mean_square + self.eps) * self.scala
33
- return x
34
-
35
-
36
- class BertLayerNorm(nn.Module):
37
- def __init__(self, hidden_size, eps=1e-5):
38
- """Construct a layernorm module in the TF style (epsilon inside the square root).
39
- """
40
- super(BertLayerNorm, self).__init__()
41
- self.weight = nn.Parameter(torch.ones(hidden_size))
42
- self.bias = nn.Parameter(torch.zeros(hidden_size))
43
- self.variance_epsilon = eps
44
-
45
- def forward(self, x):
46
- u = x.mean(-1, keepdim=True)
47
- s = (x - u).pow(2).mean(-1, keepdim=True)
48
- x = (x - u) / torch.sqrt(s + self.variance_epsilon)
49
- return self.weight * x + self.bias
50
-
51
-
52
- class ScaledSinuEmbedding(nn.Module):
53
- def __init__(self, dim):
54
- super().__init__()
55
- self.scale = nn.Parameter(torch.ones(1,))
56
- inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim))
57
- self.register_buffer('inv_freq', inv_freq)
58
-
59
- def forward(self, x):
60
- n, device = x.shape[1], x.device
61
- t = torch.arange(n, device = device).type_as(self.inv_freq)
62
- sinu = torch.einsum('i , j -> i j', t, self.inv_freq)
63
- emb = torch.cat((sinu.sin(), sinu.cos()), dim = -1)
64
- return emb * self.scale
65
-
66
-
67
- def RoPE(x, dim):
68
- """
69
- :param x: input tensor
70
- :param dim: dimension(s) to operate on
71
- :return: tensor
72
- """
73
- shape = x.shape
74
- if isinstance(dim, int):
75
- dim = [dim]
76
-
77
- spatial_shape = [shape[i] for i in dim]
78
- total_len = 1
79
- for i in spatial_shape:
80
- total_len *= i
81
- position = torch.reshape(torch.arange(total_len, dtype=torch.float, device=x.device), spatial_shape)
82
-
83
- for i in range(dim[-1] + 1, len(shape) - 1, 1):
84
- position = torch.unsqueeze(position, dim=-1)
85
-
86
- half_size = shape[-1] // 2
87
- freq_seq = -torch.arange(half_size, dtype=torch.float, device=x.device) / float(half_size)
88
- inv_freq = 10000 ** -freq_seq
89
- sinusoid = torch.einsum("...,d->...d", position, inv_freq)
90
- sin = torch.sin(sinusoid)
91
- cos = torch.cos(sinusoid)
92
- x1, x2 = torch.chunk(x, 2, dim=-1)
93
- return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1)
94
-
95
-
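For reference, the RoPE helper above generalizes rotary position embedding to arbitrary axes. A minimal single-axis sketch for a (batch, seq_len, dim) tensor, using the conventional decaying 10000**(-k/half) frequencies (the helper's exponent sign differs slightly), illustrative only::

    import torch

    def rope_1d(x):
        # x: (batch, seq_len, dim) with dim even; rotate channel pairs by
        # position-dependent angles so dot products encode relative offsets.
        _, n, d = x.shape
        half = d // 2
        inv_freq = 10000.0 ** (-torch.arange(half, dtype=torch.float32) / half)
        angle = torch.arange(n, dtype=torch.float32)[:, None] * inv_freq[None, :]
        sin, cos = angle.sin(), angle.cos()
        x1, x2 = x[..., :half], x[..., half:]
        return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1)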
96
- def rel_pos_bias(seq_len, s):
97
- a = torch.rand([1, s], dtype=torch.float)
98
- b = torch.rand([1, s], dtype=torch.float)
99
- w = torch.rand([2 * seq_len - 1], dtype=torch.float)
100
- if seq_len <= 512:
101
- t = F.pad(w[: 2 * seq_len - 1], [0, seq_len]).repeat(seq_len)
102
- t = t[..., :-seq_len].reshape(-1, seq_len, 3 * seq_len - 2)
103
- r = (2 * seq_len - 1) // 2
104
- t = t[..., r:-r]
105
- else:
106
- a = RoPE(a.repeat(seq_len, 1), dim=[0])
107
- b = RoPE(b.repeat(seq_len, 1), dim=[0])
108
- t = torch.einsum("mk,nk->mn", a, b)
109
- return t
110
-
111
- def squared_relu(x, attention_mask, dim=-1):
112
- rmask = ~(attention_mask.bool())
113
- x = x.masked_fill(rmask, 0)
114
- return torch.square(F.relu(x))
115
-
116
-
117
- def attention_normalize(a, axis=-1, mask=None, fn='softmax'):
118
- if fn == 'softmax':
119
- return XSoftmax.apply(a, mask, axis)
120
- else:
121
- mask_ = a > -float('inf') / 10
122
- # mask_ = mask_.byte()
123
- mask_ = torch.sum(mask_, axis=axis, keepdim=True)
124
- l = torch.maximum(mask_, torch.ones_like(mask_))
125
- if fn == 'relu':
126
- rmask = ~(mask.bool())
127
- a = a.masked_fill(rmask, 0)
128
- return torch.square(F.relu(a)) / l
129
- elif fn == 'softmax_plus':
130
- return XSoftmax.apply(a * torch.log(l) / np.log(512), mask, axis)
131
- return a
132
-
133
-
134
- class GAULinear(nn.Linear):
135
- def init_weight(self):
136
- nn.init.xavier_uniform_(self.weight)
137
-
138
-
139
- class GatedAttentionUnit(nn.Module):
140
- """
141
- GAU Block: Gate Attention Unit
142
- """
143
- def __init__(
144
- self,
145
- max_seq_length,
146
- hidden_size,
147
- attention_key_size=128,
148
- activation='swish',
149
- use_bias=True,
150
- attention_norm_type='squared_relu',
151
- attention_scale=True,
152
- dropout=0.1,
153
- pre_norm=False,
154
- norm_type="layer_norm",
155
- eps=1e-5,
156
- shift_token=False,
157
- use_rel_bias=False,
158
- add_residual=True,
159
- **kwargs,):
160
-
161
- super(GatedAttentionUnit, self).__init__(**kwargs)
162
- self.max_seq_length = max_seq_length
163
- self.units = hidden_size
164
- self.intermediate_size = self.units * 2
165
- self.key_size = attention_key_size
166
- self.activation = activation
167
- self.use_bias = use_bias
168
- self.attention_norm_type = attention_norm_type
169
- self.attention_scale = attention_scale
170
- self.dropout = StableDropout(dropout)
171
- self.i_dense = nn.Sequential(
172
- nn.Linear(self.units, 2 * self.intermediate_size + self.key_size, bias=self.use_bias),
173
- nn.SiLU()
174
- )
175
- self.o_dense = nn.Sequential(
176
- nn.Linear(self.intermediate_size, self.units, bias=self.use_bias),
177
- self.dropout)
178
- self.q_scaleoffset = OffsetScale(self.key_size)
179
- self.k_scaleoffset = OffsetScale(self.key_size)
180
- self.pre_norm = pre_norm
181
- self.norm = (nn.LayerNorm(hidden_size, eps=eps) if norm_type.lower() == "layer_norm" else ScaleNorm(eps=eps))
182
- self.add_residual = add_residual
183
-
184
- def forward(self, x, attention_mask=None, **kwargs):
185
- shortcut = x
186
-
187
- if self.pre_norm:
188
- x = self.norm(x)
189
-
190
- x = self.i_dense(x)
191
- u, v, qk = torch.split(x, [self.intermediate_size, self.intermediate_size, self.key_size], dim=-1)
192
- q, k = self.q_scaleoffset(qk), self.k_scaleoffset(qk)
193
- qk = RoPE(torch.stack([q, k], 2), dim=1)
194
- q, k = qk[:, :, 0], qk[:, :, 1]
195
- a = torch.einsum('bmd,bnd->bmn', q, k)
196
- if self.attention_scale:
197
- a = a / self.key_size**0.5
198
- a = sequence_masking(a, attention_mask, '-inf', -1)
199
- A = attention_normalize(a, -1, fn=self.attention_norm_type)
200
- if self.dropout:
201
- A = self.dropout(A)
202
- out = self.o_dense(u * torch.einsum('bmn,bnd->bmd', A, v))
203
-
204
- if self.add_residual:
205
- out = out + shortcut
206
- if not self.pre_norm:
207
- out = self.norm(out)
208
- return out
209
-
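The forward pass above is single-head gated attention: one shared qk projection is scaled/offset into q and k, the score matrix is normalized via attention_normalize (which recognises 'softmax', 'relu' and 'softmax_plus'; the default string 'squared_relu' falls through unchanged), and the attended values are gated by u. A stripped-down sketch of that core, with masking, RoPE and dropout omitted and the valid-length normalizer approximated by the sequence length::

    import torch
    import torch.nn.functional as F

    def gau_core(u, v, q, k, scale=True):
        # u, v: (batch, seq, expanded_dim); q, k: (batch, seq, key_size)
        a = torch.einsum('bmd,bnd->bmn', q, k)
        if scale:
            a = a / q.size(-1) ** 0.5
        attn = F.relu(a).square() / a.size(-1)  # squared-ReLU attention, as in FLASH
        return u * torch.einsum('bmn,bnd->bmd', attn, v)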
210
-
211
- class OffsetScale(nn.Module):
212
- def __init__(self, dim, heads = 1):
213
- super().__init__()
214
- self.gamma = nn.Parameter(torch.ones(heads, dim))
215
- self.beta = nn.Parameter(torch.zeros(heads, dim))
216
- # nn.init.normal_(self.gamma, std = 0.02)
217
- nn.init.xavier_uniform_(self.gamma)
218
-
219
- def forward(self, x):
220
- out = torch.einsum('... d, h d -> ... h d', x, self.gamma) + self.beta
221
- return out
222
-
223
-
224
- class BertIntermediate(nn.Module):
225
- def __init__(self, config):
226
- super().__init__()
227
- self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
228
- self.intermediate_act_fn = ACT2FN[config.hidden_act] \
229
- if isinstance(config.hidden_act, str) else config.hidden_act
230
-
231
- def forward(self, hidden_states):
232
- hidden_states = self.dense(hidden_states)
233
- hidden_states = self.intermediate_act_fn(hidden_states)
234
- return hidden_states
235
-
236
-
237
- class BertOutput(nn.Module):
238
- def __init__(self, config):
239
- super(BertOutput, self).__init__()
240
- self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
241
- self.LayerNorm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps)
242
- self.dropout = StableDropout(config.hidden_dropout_prob)
243
- self.config = config
244
-
245
- def forward(self, hidden_states, input_states, mask=None):
246
- hidden_states = self.dense(hidden_states)
247
- hidden_states = self.dropout(hidden_states)
248
- hidden_states += input_states
249
- hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states)
250
- return hidden_states
251
-
252
-
253
- class GAU(nn.Module):
254
- def __init__(self, max_seq_length, hidden_size, expansion_factor=2, s=128, norm_type="layer_norm", eps=1e-5,
255
- hidden_act="silu", shift_token=False, use_rel_bias=False, attention_norm_type='softmax',
256
- pre_norm=False, dropout=0, add_residual = True):
257
- super(GAU, self).__init__()
258
- self.max_seq_length = max_seq_length
259
- self.shift_token = shift_token
260
- hidden_dim = int(expansion_factor * hidden_size)
261
- self.norm = (nn.LayerNorm(hidden_size, eps=eps) if norm_type == "layer_norm" else ScaleNorm(eps=eps))
262
- self.use_rel_bias = use_rel_bias
263
- self.attention_norm_type = attention_norm_type
264
- # if attention_norm_type == 'relu':
265
- # self.attention_norm_func = squared_relu
266
- # else:
267
- # self.attention_norm_func = XSoftmax.apply
268
- # self.norm = norm_klass(hidden_size)
269
-
270
- self.dropout = nn.Dropout(dropout)
271
-
272
- self.to_hidden = nn.Sequential(
273
- nn.Linear(hidden_size, hidden_dim * 2),
274
- nn.SiLU()
275
- )
276
-
277
- self.to_qk = nn.Sequential(
278
- nn.Linear(hidden_size, s),
279
- nn.SiLU()
280
- )
281
-
282
- self.offsetscale = OffsetScale(s, heads = 2)
283
-
284
- self.to_out = nn.Sequential(
285
- nn.Linear(hidden_dim, hidden_size),
286
- nn.Dropout(dropout)
287
- )
288
-
289
- self.add_residual = add_residual
290
- self.act_fn = ACT2FN[hidden_act]
291
- self.pre_norm = pre_norm
292
-
293
-
294
- def forward(
295
- self,
296
- x,
297
- relative_pos = None,
298
- attention_mask = None
299
- ):
300
- seq_len, device = x.shape[-2], x.device
301
- if self.pre_norm:
302
- normed_x = self.norm(x)
303
- else:
304
- normed_x = x
305
- v, gate = self.to_hidden(normed_x).chunk(2, dim = -1)
306
-
307
- qk = self.to_qk(normed_x)
308
- base = self.offsetscale(qk)
309
- base = RoPE(base, 1).half()
310
- q, k = base.unbind(dim = -2)
311
- sim = torch.einsum('b i d, b j d -> b i j', q, k)
312
-
313
- if relative_pos is not None:
314
- sim = sim + relative_pos
315
- if attention_mask is not None:
316
- if attention_mask.dim() < 3:
317
- attention_mask = einops.rearrange(attention_mask, 'b j -> b 1 j')
318
- # attn = attn.masked_fill(~attention_mask.bool(), 0.)
319
- attn = attention_normalize(sim, mask=attention_mask, fn=self.attention_norm_type)
320
- # attn = F.relu(sim) ** 2 / seq_len# / q.size(-1)
321
- # logger.info(attn.max())
322
- attn = self.dropout(attn)
323
- # if self.causal:
324
- # causal_mask = torch.ones((seq_len, seq_len), dtype = torch.bool, device = device).triu(1)
325
- # attn = attn.masked_fill(causal_mask, 0.)
326
-
327
- out = torch.einsum('b i j, b j d -> b i d', attn.half(), v)
328
- out = out * gate
329
-
330
- out = self.to_out(out)
331
-
332
- if self.add_residual:
333
- out = out + x
334
- if not self.pre_norm:
335
- out = self.norm(out)
336
- return out
337
-
338
-
339
- class GatLayer(nn.Module):
340
- def __init__(self, config, shift_token=False, use_ffn=False):
341
- super(GatLayer, self).__init__()
342
- self.attention = GatedAttentionUnit(config.max_position_embeddings, config.hidden_size,
343
- shift_token=shift_token, use_rel_bias=config.use_rel_bias,
344
- norm_type=config.norm_type, attention_norm_type=config.attention_norm_type,
345
- pre_norm=config.pre_norm, dropout=config.hidden_dropout_prob)
346
- if use_ffn:
347
- self.intermediate = BertIntermediate(config)
348
- self.output = BertOutput(config)
349
- self.use_ffn = use_ffn
350
-
351
- def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
352
- attention_output = self.attention(hidden_states, attention_mask=attention_mask, relative_pos=relative_pos)
353
- if self.use_ffn:
354
- intermediate_output = self.intermediate(attention_output)
355
- layer_output = self.output(intermediate_output, attention_output)
356
- return layer_output
357
- else:
358
- return attention_output
359
-
360
-
361
- class RelativePositionBias(nn.Module):
362
- def __init__(
363
- self,
364
- scale,
365
- causal = False,
366
- num_buckets = 32,
367
- max_distance = 128
368
- ):
369
- super().__init__()
370
- self.scale = scale
371
- self.causal = causal
372
- self.num_buckets = num_buckets
373
- self.max_distance = max_distance
374
- self.relative_attention_bias = nn.Embedding(num_buckets, 1)
375
-
376
- @staticmethod
377
- def _relative_position_bucket(
378
- relative_position,
379
- causal = True,
380
- num_buckets = 32,
381
- max_distance = 128
382
- ):
383
- ret = 0
384
- n = -relative_position
385
- if not causal:
386
- num_buckets //= 2
387
- ret += (n < 0).long() * num_buckets
388
- n = torch.abs(n)
389
- else:
390
- n = torch.max(n, torch.zeros_like(n))
391
-
392
- max_exact = num_buckets // 2
393
- is_small = n < max_exact
394
-
395
- val_if_large = max_exact + (
396
- torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
397
- ).long()
398
- val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
399
-
400
- ret += torch.where(is_small, n, val_if_large)
401
- return ret
402
-
403
- def forward(self, x):
404
- i, j, device = *x.shape[-2:], x.device
405
- q_pos = torch.arange(i, dtype = torch.long, device = device)
406
- k_pos = torch.arange(j, dtype = torch.long, device = device)
407
- rel_pos = einops.rearrange(k_pos, 'j -> 1 j') - einops.rearrange(q_pos, 'i -> i 1')
408
- rp_bucket = self._relative_position_bucket(rel_pos, causal = self.causal, num_buckets = self.num_buckets, max_distance = self.max_distance)
409
- values = self.relative_attention_bias(rp_bucket)
410
- bias = einops.rearrange(values, 'i j 1 -> i j')
411
- return bias * self.scale
412
-
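The bucketing above follows the T5 scheme: half of the buckets encode the sign of the offset, small offsets get exact buckets, and larger offsets share log-spaced buckets up to max_distance. A compact restatement of the bidirectional branch (illustrative only; it clamps n before the log instead of relying on torch.where to discard the invalid branch)::

    import math
    import torch

    def relative_bucket(rel_pos, num_buckets=32, max_distance=128):
        num_buckets //= 2
        ret = (rel_pos > 0).long() * num_buckets  # sign half
        n = rel_pos.abs()
        max_exact = num_buckets // 2
        val_if_large = max_exact + (
            torch.log(n.clamp(min=1).float() / max_exact)
            / math.log(max_distance / max_exact) * (num_buckets - max_exact)
        ).long()
        val_if_large = torch.minimum(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
        return ret + torch.where(n < max_exact, n, val_if_large)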
413
-
414
- class GatEmbeddings(nn.Module):
415
- """Construct the embeddings from word, position and token_type embeddings.
416
- """
417
- def __init__(self, config, with_position=False):
418
- super(GatEmbeddings, self).__init__()
419
- self.word_embeddings = nn.Embedding(
420
- config.vocab_size, config.hidden_size)
421
- self.token_type_embeddings = nn.Embedding(
422
- config.type_vocab_size, config.hidden_size)
423
- self.with_position = with_position
424
- if with_position:
425
- self.position_embeddings = ScaledSinuEmbedding(config.hidden_size)
426
-
427
- # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
428
- # any TensorFlow checkpoint file
429
- self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5)
430
- self.dropout = StableDropout(config.hidden_dropout_prob)
431
-
432
- def forward(self, input_ids, token_type_ids=None, position_ids=None, token_mask=None):
433
- seq_length = input_ids.size(1)
434
- if position_ids is None:
435
- position_ids = torch.arange(
436
- seq_length, dtype=torch.long, device=input_ids.device)
437
- position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
438
- if token_type_ids is None:
439
- token_type_ids = torch.zeros_like(input_ids)
440
-
441
- words_embeddings = self.word_embeddings(input_ids)
442
- if self.with_position:
443
- position_embeddings = self.position_embeddings(words_embeddings)
444
- else:
445
- position_embeddings = 0
446
- token_type_embeddings = self.token_type_embeddings(token_type_ids)
447
-
448
- # if self.num_pos_emb > 1:
449
- # num_batch = position_embeddings.size(0)
450
- # num_pos = position_embeddings.size(1)
451
- # position_embeddings = position_embeddings.view(
452
- # num_batch, num_pos, self.num_pos_emb, -1)[torch.arange(0, num_batch).long(), :, task_idx, :]
453
-
454
- embeddings = words_embeddings + position_embeddings + token_type_embeddings
455
- # if self.fp32_embedding:
456
- # embeddings = embeddings.half()
457
- embeddings = MaskedLayerNorm(self.LayerNorm, embeddings, token_mask)
458
- embeddings = self.dropout(embeddings)
459
- return {
460
- 'embeddings': embeddings,
461
- 'position_embeddings': position_embeddings}
462
-
463
-
464
- class GatEncoder(nn.Module):
465
- def __init__(self, config, shift_token=False):
466
- super().__init__()
467
- layer = GatLayer(config, shift_token=shift_token)
468
- self.layer = nn.ModuleList([copy.deepcopy(layer)
469
- for _ in range(config.num_hidden_layers)])
470
-
471
- def get_attention_mask(self, attention_mask):
472
- if attention_mask.dim() <= 2:
473
- extended_attention_mask = attention_mask.unsqueeze(1)
474
- attention_mask = extended_attention_mask*extended_attention_mask.squeeze(-2).unsqueeze(-1)
475
- attention_mask = attention_mask.byte()
476
- return attention_mask
477
-
478
- def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, return_att=False, query_states = None, relative_pos=None):
479
- all_encoder_layers = []
480
- att_matrices = []
481
- if isinstance(hidden_states, Sequence):
482
- next_kv = hidden_states[0]
483
- else:
484
- next_kv = hidden_states
485
- # rel_embeddings = self.get_rel_embedding()
486
- for i, layer_module in enumerate(self.layer):
487
- output_states = layer_module(next_kv, attention_mask, query_states = query_states, relative_pos=relative_pos)
488
- if return_att:
489
- output_states, att_m = output_states
490
-
491
- # if i == 0 and self.with_conv:
492
- # prenorm = output_states #output['prenorm_states']
493
- # output_states = self.conv(hidden_states, prenorm, input_mask)
494
-
495
- if query_states is not None:
496
- query_states = output_states
497
- if isinstance(hidden_states, Sequence):
498
- next_kv = hidden_states[i+1] if i+1 < len(self.layer) else None
499
- else:
500
- next_kv = output_states
501
-
502
- if output_all_encoded_layers:
503
- all_encoder_layers.append(output_states)
504
- if return_att:
505
- att_matrices.append(att_m)
506
- if not output_all_encoded_layers:
507
- all_encoder_layers.append(output_states)
508
- if return_att:
509
- att_matrices.append(att_m)
510
- return {
511
- 'hidden_states': all_encoder_layers,
512
- 'attention_matrices': att_matrices
513
- }
514
-
515
-
516
- class GatModel(torch.nn.Module):
517
- """
518
- Parameters:
519
- config:
520
- A model config class instance with the configuration to build a new model. The schema is similar to `BertConfig`,
521
-
522
- pre_trained:
523
- The pre-trained DeBERTa model; it can be the physical path of a pre-trained DeBERTa model or one of the released configurations,
524
- i.e. [**base, large, base_mnli, large_mnli**]
525
-
526
- """
527
-
528
- def __init__(self, config=None, pre_trained=None, pooler=False, shift_token=False, causal=False, **kwargs):
529
- super().__init__()
530
- state = None
531
- if pre_trained is not None:
532
- state, model_config = load_model_state(pre_trained)
533
- if config is not None and model_config is not None:
534
- for k in config.__dict__:
535
- if k not in ['hidden_size',
536
- 'intermediate_size',
537
- 'num_attention_heads',
538
- 'num_hidden_layers',
539
- 'vocab_size',
540
- 'max_position_embeddings']:
541
- model_config.__dict__[k] = config.__dict__[k]
542
- config = copy.copy(model_config)
543
- self.embeddings = GatEmbeddings(config, with_position=True)
544
- self.encoder = GatEncoder(config, shift_token=shift_token)
545
- if not pooler:
546
- self.pooler = None
547
- self.config = config
548
- self.pre_trained = pre_trained
549
- self.apply_state(state)
550
-
551
- def get_attention_mask(self, input_ids=None, token_type_ids=None, attention_mask=None, input_mask=None):
552
- if attention_mask is None:
553
- if input_mask is not None:
554
- return input_mask.unsqueeze(-1).expand(input_mask.size(0), input_mask.size(1), input_mask.size(1))
555
- else:
556
- return torch.ones_like(input_ids, dtype=torch.uint8).unsqueeze(-1).expand(input_ids.size(0), input_ids.size(1), input_ids.size(1))  # input_mask is None in this branch, so sizes must come from input_ids
557
- else:
558
- if attention_mask.dim() == 2:
559
- if input_mask is not None:
560
- attention_mask = attention_mask * input_mask
561
- return attention_mask.unsqueeze(-1).expand(input_mask.size(0), input_mask.size(1), attention_mask.size(-1))
562
- if attention_mask.dim() == 4:
563
- attention_mask = attention_mask.squeeze(2)
564
- if attention_mask.dim() == 3:
565
- if input_mask is not None:
566
- return attention_mask * input_mask.unsqueeze(-1).expand(input_mask.size(0), input_mask.size(1), attention_mask.size(-1))
567
- else:
568
- return attention_mask
569
-
570
-
571
- def forward(self, input_ids, input_mask, attention_mask=None, token_type_ids=None,
572
- output_all_encoded_layers=True, position_ids=None, return_att=False):
573
- """
574
- Args:
575
- input_ids:
576
- a torch.LongTensor of shape [batch_size, sequence_length] \
577
- with the word token indices in the vocabulary
578
-
579
- attention_mask:
580
- an optional parameter for input mask or attention mask.
581
-
582
- - If it's an input mask, then it will be torch.LongTensor of shape [batch_size, sequence_length] with indices \
583
- selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max \
584
- input sequence length in the current batch. It's the mask that we typically use for attention when \
585
- a batch has varying length sentences.
586
-
587
- - If it's an attention mask then it will be torch.LongTensor of shape [batch_size, sequence_length, sequence_length]. \
588
- In this case, it's a mask indicating which tokens in the sequence should be attended to by other tokens in the sequence.
589
-
590
- token_type_ids:
591
- an optional torch.LongTensor of shape [batch_size, sequence_length] with the token \
592
- types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to \
593
- a `sentence B` token (see BERT paper for more details).
594
-
595
- output_all_encoded_layers:
596
- whether to output results of all encoder layers; default: True
597
-
598
- Returns:
599
-
600
- - The output of the stacked transformer layers if `output_all_encoded_layers=True`, else \
601
- the last layer of stacked transformer layers
602
-
603
- - Attention matrix of self-attention layers if `return_att=True`
604
-
605
-
606
- Example::
607
-
608
- # Batch of wordPiece token ids.
609
- # Each sample was padded with zero to the maximum length of the batch
610
- input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
611
- # Mask of valid input ids
612
- attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
613
-
614
- # DeBERTa model initialized with pretrained base model
615
- bert = DeBERTa(pre_trained='base')
616
-
617
- encoder_layers = bert(input_ids, attention_mask=attention_mask)
618
-
619
- """
620
- if token_type_ids is None:
621
- token_type_ids = torch.zeros_like(input_ids)
622
- # input_mask = torch.ones_like(input_ids)
623
-
624
- if input_mask is None:
625
- idxs = torch.flip(torch.cumsum(torch.flip(token_type_ids, [-1]), axis=1), [-1])
626
- input_mask = idxs > 0
627
- if not torch.any(input_mask):
628
- input_mask = torch.ones_like(input_ids)
629
- input_mask = input_mask.byte()
630
- attention_mask = self.get_attention_mask(input_ids, token_type_ids, attention_mask, input_mask)
631
- attention_mask = attention_mask.byte()
632
- embedding_output = self.embeddings(input_ids.to(torch.long), token_type_ids.to(torch.long), position_ids, input_mask)
633
- encoder_output = self.encoder(embedding_output['embeddings'], attention_mask, output_all_encoded_layers=output_all_encoded_layers, return_att = return_att)
634
- encoder_output.update(embedding_output)
635
- return encoder_output
636
-
637
- def apply_state(self, state = None):
638
- """ Load state from previous loaded model state dictionary.
639
-
640
- Args:
641
- state (:obj:`dict`, optional): State dictionary as the state returned by torch.module.state_dict(), default: `None`. \
642
- If it's `None`, then the method will use the pre-trained state loaded via the constructor to re-initialize \
643
- the `DeBERTa` model
644
- """
645
- if self.pre_trained is None and state is None:
646
- return
647
- if state is None:
648
- state, config = load_model_state(self.pre_trained)
649
- self.config = config
650
-
651
- prefix = ''
652
- for k in state:
653
- if 'embeddings.' in k:
654
- if not k.startswith('embeddings.'):
655
- prefix = k[:k.index('embeddings.')]
656
- break
657
-
658
- missing_keys = []
659
- unexpected_keys = []
660
- error_msgs = []
661
- self._load_from_state_dict(state, prefix = prefix, local_metadata=None, strict=True, missing_keys=missing_keys, unexpected_keys=unexpected_keys, error_msgs=error_msgs)
662
-
663
-
664
- if __name__ == '__main__':
665
- model = GatModel(ModelConfig())  # smoke test; the constructor expects a config object, not bare ints (assumes ModelConfig() provides usable defaults)
 
modeling/mlm.py DELETED
@@ -1,38 +0,0 @@
1
- # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
2
- # Copyright (c) Microsoft, Inc. 2020
3
- #
4
- # This source code is licensed under the MIT license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- # This piece of code is modified based on https://github.com/huggingface/transformers
8
-
9
- import torch
10
- from torch import nn
11
- import pdb
12
-
13
- from .bert import LayerNorm, ACT2FN
- from .ops import MaskedLayerNorm  # used in forward() below; exported by .ops
14
-
15
- __all__ = ['MLMPredictionHead']
16
-
17
- class MLMPredictionHead(nn.Module):
18
- def __init__(self, config, vocab_size):
19
- super().__init__()
20
- self.embedding_size = getattr(config, 'embedding_size', config.hidden_size)
21
- self.dense = nn.Linear(config.hidden_size, self.embedding_size)
22
- self.transform_act_fn = ACT2FN[config.hidden_act] \
23
- if isinstance(config.hidden_act, str) else config.hidden_act
24
-
25
- self.LayerNorm = LayerNorm(self.embedding_size, config.layer_norm_eps)
26
- self.bias = nn.Parameter(torch.zeros(vocab_size))
27
- self.pre_norm = PreLayerNorm(config)  # NOTE: PreLayerNorm is not imported above; it must be provided by the package
28
-
29
- def forward(self, hidden_states, embeding_weight):
30
- hidden_states = self.pre_norm(hidden_states)
31
- hidden_states = self.dense(hidden_states)
32
- hidden_states = self.transform_act_fn(hidden_states)
33
- # b x s x d
34
- hidden_states = MaskedLayerNorm(self.LayerNorm, hidden_states)
35
-
36
- # b x s x v
37
- logits = torch.matmul(hidden_states, embeding_weight.t().to(hidden_states)) + self.bias
38
- return logits
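The head ties its output projection to the input embedding matrix instead of learning a separate vocab-size projection; the last line is just a matrix product against the transposed embedding table plus a per-token bias. A minimal sketch of that weight tying (names illustrative)::

    import torch

    def tied_lm_logits(hidden, embedding_weight, bias):
        # hidden: (batch, seq, emb); embedding_weight: (vocab, emb); bias: (vocab,)
        return torch.matmul(hidden, embedding_weight.t().to(hidden)) + bias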
 
modeling/modeling.py DELETED
The diff for this file is too large to render. See raw diff
 
modeling/nnmodule.py DELETED
@@ -1,184 +0,0 @@
1
- import pdb
2
- import os
3
- import torch
4
- import copy
5
- from torch import nn, tensor
6
- from .config import ModelConfig
7
- from ..utils import xtqdm as tqdm
8
- from .cache_utils import load_model_state
9
- from .flash import GAULinear
10
-
11
- from ..utils import get_logger
12
- logger = get_logger()
13
-
14
- __all__ = ['NNModule']
15
-
16
- def truncated_normal_(shape, mean=0, std=0.09):
17
- with torch.no_grad():
18
- tensor = torch.zeros(shape)
19
- tmp = tensor.new_empty(shape + (4,)).normal_()
20
- valid = (tmp < 2) & (tmp > -2)
21
- ind = valid.max(-1, keepdim=True)[1]
22
- tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1))
23
- tensor.data.mul_(std).add_(mean)
24
- return tensor
25
-
26
- class NNModule(nn.Module):
27
- """ An abstract class to handle weights initialization and \
28
- a simple interface for downloading and loading pretrained models.
29
-
30
- Args:
31
-
32
- config (:obj:`~DeBERTa.deberta.ModelConfig`): The model config to the module
33
-
34
- """
35
-
36
- def __init__(self, config, *inputs, **kwargs):
37
- super().__init__()
38
- self.config = config
39
-
40
- def init_weights(self, module):
41
- """ Apply Gaussian(mean=0, std=`config.initializer_range`) initialization to the module.
42
-
43
- Args:
44
-
45
- module (:obj:`torch.nn.Module`): The module to apply the initialization.
46
-
47
- Example::
48
-
49
- class MyModule(NNModule):
50
- def __init__(self, config):
51
- # Add construction instructions
52
- self.bert = DeBERTa(config)
53
-
54
- # Add other modules
55
- ...
56
-
57
- # Apply initialization
58
- self.apply(self.init_weights)
59
-
60
- """
61
- if isinstance(module, (nn.Linear, nn.Embedding)):
62
- module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
63
- if isinstance(module, nn.Linear) and module.bias is not None:
64
- module.bias.data.zero_()
65
-
66
- def init_weights_gau(self, module):
67
- """ Apply Gaussian(mean=0, std=`config.initializer_range`) initialization to the module.
68
-
69
- Args:
70
-
71
- module (:obj:`torch.nn.Module`): The module to apply the initialization.
72
-
73
- Example::
74
-
75
- class MyModule(NNModule):
76
- def __init__(self, config):
77
- # Add construction instructions
78
- self.bert = DeBERTa(config)
79
-
80
- # Add other modules
81
- ...
82
-
83
- # Apply initialization
84
- self.apply(self.init_weights)
85
-
86
- """
87
- if isinstance(module, GAULinear):
88
- module.init_weight()
89
- else:
90
- if isinstance(module, (nn.Linear, nn.Embedding)):
91
- # module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
92
- module.weight.data.copy_(self.initializer(module.weight.data.shape))
93
- if isinstance(module, nn.Linear) and module.bias is not None:
94
- module.bias.data.zero_()
95
-
96
- def initializer(self, shape, dtype=None, order=3, gain=1.0):
97
- if shape[1] > 10000 or shape[1] < 10:
98
- hidden_size = shape[0]
99
- else:
100
- hidden_size = shape[1]
101
- gain *= self.config.num_hidden_layers ** (-1.0 / order)
102
- stddev = 1.13684723 / hidden_size**0.5 * gain
103
- return torch.nn.init.trunc_normal_(torch.empty(shape, dtype=dtype), std=stddev)# truncated_normal_(shape, std=stddev)
104
-
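For a sense of scale of the initializer above (assumed values, for illustration only)::

    # hidden_size=768, num_hidden_layers=12, order=3:
    # gain = 12 ** (-1/3) ~= 0.437; std ~= 1.13684723 / 768 ** 0.5 * 0.437 ~= 0.018
    # i.e. deeper stacks get proportionally smaller truncated-normal weights.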
105
- @classmethod
106
- def load_model(cls, model_path, model_config=None, tag=None, no_cache=False, cache_dir=None , *inputs, **kwargs):
107
- """ Instantiate a sub-class of NNModule from a pre-trained model file.
108
-
109
- Args:
110
-
111
- model_path (:obj:`str`): Path or name of the pre-trained model, which can be either:
112
-
113
- - The path of pre-trained model
114
-
115
- - The pre-trained DeBERTa model name in `DeBERTa GitHub releases <https://github.com/microsoft/DeBERTa/releases>`_, i.e. [**base, base_mnli, large, large_mnli**].
116
-
117
- If `model_path` is `None` or `-`, then the method will create a new sub-class without initializing it from pre-trained models.
118
-
119
- model_config (:obj:`str`): The path of the model config file. If it's `None`, then the method will try to find the config in the following order:
120
-
121
- 1. ['config'] in the model state dictionary.
122
-
123
- 2. `model_config.json` aside the `model_path`.
124
-
125
- If it failed to find a config the method will fail.
126
-
127
- tag (:obj:`str`, optional): The release tag of DeBERTa, default: `None`.
128
-
129
- no_cache (:obj:`bool`, optional): Disable local cache of downloaded models, default: `False`.
130
-
131
- cache_dir (:obj:`str`, optional): The cache directory used to save the downloaded models, default: `None`. If it's `None`, then the models will be saved at `$HOME/.~DeBERTa`
132
-
133
- Return:
134
-
135
- :obj:`NNModule` : The sub-class object.
136
-
137
- """
138
- # Load config
139
- if model_config:
140
- config = ModelConfig.from_json_file(model_config)
141
- else:
142
- config = None
143
- model_config = None
144
- model_state = None
145
- if (model_path is not None) and (model_path.strip() == '-' or model_path.strip()==''):
146
- model_path = None
147
- try:
148
- model_state, model_config = load_model_state(model_path, tag=tag, no_cache=no_cache, cache_dir=cache_dir)
149
- except Exception as exp:
150
- raise Exception(f'Failed to get model {model_path}. Exception: {exp}')
151
-
152
- if config is not None and model_config is not None:
153
- for k in config.__dict__:
154
- if k not in ['hidden_size',
155
- 'intermediate_size',
156
- 'num_attention_heads',
157
- 'num_hidden_layers',
158
- 'vocab_size',
159
- 'max_position_embeddings'] or (k not in model_config.__dict__) or (model_config.__dict__[k] < 0):
160
- model_config.__dict__[k] = config.__dict__[k]
161
- if model_config is not None:
162
- config = copy.copy(model_config)
163
- vocab_size = config.vocab_size
164
- # Instantiate model.
165
- model = cls(config, *inputs, **kwargs)
166
- if not model_state:
167
- return model
168
- # copy state_dict so _load_from_state_dict can modify it
169
- state_dict = model_state.copy()
170
-
171
- missing_keys = []
172
- unexpected_keys = []
173
- error_msgs = []
174
- metadata = getattr(state_dict, '_metadata', None)
175
- def load(module, prefix=''):
176
- local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
177
- module._load_from_state_dict(
178
- state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
179
- for name, child in module._modules.items():
180
- if child is not None:
181
- load(child, prefix + name + '.')
182
- load(model)
183
- logger.warning(f'Missing keys: {missing_keys}, unexpected_keys: {unexpected_keys}, error_msgs: {error_msgs}')
184
- return model
 
modeling/ops.py CHANGED
@@ -7,12 +7,10 @@
7
  # Date: 01/15/2020
8
  #
9
 
10
- import pdb
11
  import math
12
  from packaging import version
13
  import torch
14
  from torch.nn import LayerNorm
15
- from wywLM.utils.jit_tracing import traceable
16
 
17
  if version.Version(torch.__version__) >= version.Version('1.0.0'):
18
  from torch import _softmax_backward_data as _softmax_backward_data
@@ -21,7 +19,7 @@ else:
21
 
22
  __all__ = ['StableDropout', 'MaskedLayerNorm', 'XSoftmax', 'ACT2FN', 'LayerNorm']
23
 
24
- @traceable
25
  class XSoftmax(torch.autograd.Function):
26
  """ Masked Softmax which is optimized for saving memory
27
 
@@ -113,7 +111,7 @@ def get_mask(input, local_context):
113
 
114
  return mask, dropout
115
 
116
- @traceable
117
  class XDropout(torch.autograd.Function):
118
  @staticmethod
119
  def forward(ctx, input, local_ctx):
 
7
  # Date: 01/15/2020
8
  #
9
 
 
10
  import math
11
  from packaging import version
12
  import torch
13
  from torch.nn import LayerNorm
 
14
 
15
  if version.Version(torch.__version__) >= version.Version('1.0.0'):
16
  from torch import _softmax_backward_data as _softmax_backward_data
 
19
 
20
  __all__ = ['StableDropout', 'MaskedLayerNorm', 'XSoftmax', 'ACT2FN', 'LayerNorm']
21
 
22
+
23
  class XSoftmax(torch.autograd.Function):
24
  """ Masked Softmax which is optimized for saving memory
25
 
 
111
 
112
  return mask, dropout
113
 
114
+
115
  class XDropout(torch.autograd.Function):
116
  @staticmethod
117
  def forward(ctx, input, local_ctx):
modeling/pretrained_models.py DELETED
@@ -1,2 +0,0 @@
1
-
2
-
 
 
 
modeling/wywlm_modeling.py DELETED
@@ -1,446 +0,0 @@
1
- # Copyright (c) Microsoft, Inc. 2020
2
- #
3
- # This source code is licensed under the MIT license found in the
4
- # LICENSE file in the root directory of this source tree.
5
- #
6
- # Zhou Bo
7
- # Date: 01/15/2020
8
- #
9
-
10
- import copy
11
- import torch
12
- import os
13
- import random
14
-
15
- import json
16
- from .ops import *
17
- from .bert import *
18
- from .bert import BertLayer
19
- from .config import ModelConfig
20
- from .cache_utils import load_model_state
21
- from .nnmodule import NNModule
22
-
23
- # from ..utils.bad_grad_viz import register_hooks
24
-
25
- __all__ = ['WywLM']
26
-
27
- def flatten_states(q_states, mask_index):
28
- q_states = q_states.reshape((-1, q_states.size(-1)))
29
- q_states = q_states.index_select(0, mask_index)
30
- return q_states
31
-
32
-
33
- class UGDecoder(torch.nn.Module):
34
- def __init__(self, config, vocab_size):
35
- super().__init__()
36
- self.config = config
37
- self.position_biased_input = getattr(config, 'position_biased_input', True)
38
- # self.layer = torch.nn.ModuleList([BertLayer(config) for _ in range(2)])
39
-
40
- # self.causal_mask = torch.tril(torch.ones((input_ids.dim(0), input_ids.dim(1), input_ids.dim(1))), diagonal=0)
41
-
42
- def forward(self, ctx_layers, word_embedding, input_ids, z_states, attention_mask, \
43
- encoder, target_ids=None, relative_pos=None, decode=False, s2s_idx=None):
44
- causal_outputs, lm_outputs = self.emd_context_layer(ctx_layers, z_states, attention_mask,
45
- encoder, target_ids, input_ids,
46
- relative_pos=relative_pos, decode=decode,
47
- word_embedding=word_embedding, s2s_idx=s2s_idx)
48
- # loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
49
-
50
- # ctx_layer = mlm_ctx_layers[-1]
51
-
52
- # lm_logits = lm_logits.view(-1, lm_logits.size(-1))
53
-
54
- return causal_outputs[-1], lm_outputs[-1]
55
-
56
- def emd_context_layer(self, encoder_layers, z_states, attention_mask, encoder, target_ids, input_ids,\
57
- relative_pos=None, decode=False, word_embedding=None, s2s_idx=None):
58
- # if decode:
59
- # attention_mask = torch.tril(torch.ones((input_ids.shape[0], 1, input_ids.shape[1], input_ids.shape[1])), diagonal=0).to(input_ids.device)
60
- # else:
61
- if attention_mask.dim()<=2:
62
- extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
63
- att_mask = extended_attention_mask.byte()
64
- attention_mask = att_mask*att_mask.squeeze(-2).unsqueeze(-1)
65
- elif attention_mask.dim()==3:
66
- attention_mask = attention_mask.unsqueeze(1)
67
-
68
-
69
- if not self.position_biased_input:
70
-
71
-
72
- lm_outputs = []
73
- # else:
74
- hidden_states = encoder_layers[-2]
75
- layers = [encoder.layer[-1] for _ in range(2)]
76
- z_states += hidden_states
77
- query_states = z_states
78
- query_mask = attention_mask
79
- rel_embeddings = encoder.get_rel_embedding()
80
- for layer in layers:
81
- # TODO: pass relative pos ids
82
- output = layer(hidden_states, query_mask, return_att=False,
83
- query_states=query_states, relative_pos=relative_pos,
84
- rel_embeddings=rel_embeddings)
85
- query_states = output
86
- lm_outputs.append(query_states)
87
-
88
- # if decode:
89
- attention_mask = torch.tril(torch.ones((input_ids.shape[0], 1, input_ids.shape[1], input_ids.shape[1])),
90
- diagonal=0).to(input_ids.device)
91
- causal_outputs = []
92
- # with torch.no_grad():
93
- target_embd = word_embedding(target_ids)
94
-
95
- target_embd += z_states.detach()
96
- # self attention of target
97
- output = layers[-2](target_embd, attention_mask, return_att=False,
98
- query_states=target_embd, relative_pos=relative_pos,
99
- rel_embeddings=encoder.get_rel_embedding())
100
- causal_outputs.append(output)
101
- # cross attention
102
- output = layers[-1](output, attention_mask, return_att=False,
103
- query_states=query_states, relative_pos=relative_pos,
104
- rel_embeddings=encoder.get_rel_embedding())
105
- causal_outputs.append(output)
106
-
107
- else:
108
- causal_outputs = [encoder_layers[-1]]
109
- lm_outputs = [encoder_layers[-1]]
110
- return causal_outputs, lm_outputs
111
-
112
-
113
- def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
114
- """
115
- Shift input ids one token to the right.
116
- """
117
- shifted_input_ids = input_ids.new_zeros(input_ids.shape)
118
- shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
119
- shifted_input_ids[:, 0] = decoder_start_token_id
120
-
121
- if pad_token_id is None:
122
- raise ValueError("self.model.config.pad_token_id has to be defined.")
123
- # replace possible -100 values in labels by `pad_token_id`
124
- shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
125
-
126
- return shifted_input_ids
127
-
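A quick usage example of the shift above (token ids are illustrative: pad_token_id=0, decoder_start_token_id=1)::

    import torch

    labels = torch.tensor([[5, -100, 7, 8]])
    decoder_inputs = shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=1)
    # labels [[5, -100, 7, 8]] -> decoder inputs [[1, 5, 0, 7]]:
    # everything moves one step right, position 0 becomes the start token,
    # and the ignored label -100 is replaced by the pad id.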
128
-
129
- class WywLMLoss(torch.nn.Module):
130
- def __init__(self, config) -> None:
131
- super().__init__()
132
- self.loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')
133
- hidden_size = getattr(config, 'embedding_size', config.hidden_size)
134
- self.compare = torch.nn.Linear(hidden_size * 3, 2)
135
- # self.mlm_head = BertLMPredictionHead(config, config.vocab_size)
136
- self.lm_head = BertLMPredictionHead(config, config.vocab_size)
137
-
138
- def forward(self, logits, lm_logits, target_ids, dict_pos, input_ids, target_ids_s2s, decode=False, ebd_weight=None, task=0):
139
- loss_compare = torch.tensor(0).to(logits).float()
140
- mlm_loss = torch.tensor(0).to(logits).float()
141
- lm_loss = torch.tensor(0).to(logits).float()
142
-
143
- # else:
144
- if task == 1:
145
- compare_logits = []
146
- compare_labels = []
147
- for bi, sampel_pos in enumerate(dict_pos):
148
- num_pos = int((sampel_pos > 0).sum().detach().cpu().numpy() / 4) - 1
149
- if num_pos <= 1:
150
- continue
151
- for pi in range(num_pos):
152
- pos = sampel_pos[pi]
153
- entry_logits = logits[bi][pos[0]: pos[1]]
154
- desc_logits = logits[bi][pos[2]: pos[3]]
155
- neg_num = random.randint(0, num_pos) # torch.randint(low=0, high=num_pos, size=(1,))
156
- ids_neg = input_ids[bi][sampel_pos[neg_num][0]: sampel_pos[neg_num][1]]
157
- ids_pos = input_ids[bi][pos[0]: pos[1]]
158
- if neg_num == pi or (ids_neg.shape == ids_pos.shape and torch.all(ids_neg == ids_pos)):
159
- neg_num = -1
160
- for ni in range(num_pos):
161
- neg_num = random.randint(0, num_pos)# torch.randint(low=0, high=num_pos, size=(1,))
162
- ids_neg = input_ids[bi][sampel_pos[neg_num][0]: sampel_pos[neg_num][1]]
163
- if neg_num != pi and (ids_neg.shape != ids_pos.shape or not torch.all(ids_neg == ids_pos)):
164
- break
165
- else:
166
- neg_num = -1
167
- if neg_num == -1:
168
- continue
169
- neg_desc_logits = logits[bi][sampel_pos[neg_num][2]: sampel_pos[neg_num][3]]
170
- if torch.any(torch.isnan(neg_desc_logits)):
171
- print('error')
172
- entry_logits = entry_logits.mean(dim=0, keepdim=True).float()
173
- desc_logits = desc_logits.mean(dim=0, keepdim=True).float()
174
- neg_desc_logits = neg_desc_logits.mean(dim=0, keepdim=True).float()
175
- compare_logits.append(torch.concat([entry_logits, desc_logits, entry_logits - desc_logits], dim=1))
176
- compare_logits.append(torch.concat([entry_logits, neg_desc_logits, entry_logits - neg_desc_logits], dim=1))
177
- compare_labels += [1, 0]
178
- if len(compare_logits) > 0:
179
- compare_logits = torch.concat(compare_logits, dim=0).to(logits.dtype)
180
- compare_pred = self.compare(compare_logits)
181
- loss_compare = self.loss_fn(compare_pred, torch.tensor(compare_labels, dtype=torch.long, device=compare_logits.device)).mean()
182
-
183
- if torch.all(loss_compare == 0):
184
- entry_logits = logits[0][0].unsqueeze(0)
185
- compare_logits = torch.concat([entry_logits, entry_logits, entry_logits - entry_logits], dim=1)
186
- compare_pred = self.compare(compare_logits)
187
- compare_labels = [1]
188
- loss_compare = self.loss_fn(compare_pred, torch.tensor(compare_labels, dtype=torch.long, device=compare_logits.device)).mean()
189
-
190
- # if decode:
191
- # lm_labels = target_ids_s2s.index_select(0, (target_ids_s2s.sum(-1) > 0).nonzero().view(-1)[0])
192
- # lm_labels = lm_labels.repeat(logits.shape[0], 1).clone().view(-1)
193
- # lm_labels = target_ids_s2s.clone()
194
- # target_ids_s2s = shift_tokens_right(target_ids_s2s, 0, 1)
195
- # target_ids_s2s.masked_fill_(target_ids_s2s==0, 3)
196
- if task == 0:
197
- _mask_index = (target_ids_s2s > 0).view(-1).nonzero().view(-1)
198
- lm_logits_ = flatten_states(lm_logits, _mask_index)
199
- lm_pred = self.lm_head(lm_logits_, ebd_weight).float()
200
- lm_labels = target_ids_s2s.clone().reshape(-1)
201
- lm_labels = lm_labels.index_select(0, _mask_index)
202
- # lm_pred = torch.nn.functional.log_softmax(lm_pred)
203
- # lm_loss = torch.nn.functional.nll_loss(lm_pred, lm_labels.long())
204
- lm_loss = self.loss_fn(lm_pred, lm_labels.long())
205
- # dot = register_hooks(lm_loss)
206
- # lm_loss.backward()
207
- # dot().save('tmp.dot')
208
-
209
-
210
- _mask_index = (target_ids > 0).view(-1).nonzero().view(-1)
211
- mlm_logits = flatten_states(logits, _mask_index)
212
- mlm_pred = self.lm_head(mlm_logits, ebd_weight).float()
213
- mlm_labels = target_ids.view(-1)
214
- mlm_labels = mlm_labels.index_select(0, _mask_index)
215
- mlm_loss = self.loss_fn(mlm_pred, mlm_labels.long())
216
- return loss_compare, mlm_loss, lm_loss
217
-
218
- class WywLM(torch.nn.Module):
219
- """ DeBERTa encoder
220
- This module is composed of the input embedding layer with stacked transformer layers with disentangled attention.
221
-
222
- Parameters:
223
- config:
224
- A model config class instance with the configuration to build a new model. The schema is similar to `BertConfig`, \
225
- for more details, please refer to :class:`~DeBERTa.deberta.ModelConfig`
226
-
227
- pre_trained:
228
- The pre-trained DeBERTa model; it can be the physical path of a pre-trained DeBERTa model or one of the released configurations, \
229
- i.e. [**base, large, base_mnli, large_mnli**]
230
-
231
- """
232
-
233
- def __init__(self, config=None, pre_trained=None):
234
- super().__init__()
235
- state = None
236
- if pre_trained is not None:
237
- state, model_config = load_model_state(pre_trained)
238
- if config is not None and model_config is not None:
239
- for k in config.__dict__:
240
- if k not in ['hidden_size',
241
- 'intermediate_size',
242
- 'num_attention_heads',
243
- 'num_hidden_layers',
244
- 'vocab_size',
245
- 'max_position_embeddings']:
246
- model_config.__dict__[k] = config.__dict__[k]
247
- config = copy.copy(model_config)
248
- self.embeddings = BertEmbeddings(config)
249
- self.encoder = BertEncoder(config)
250
- self.config = config
251
- self.pre_trained = pre_trained
252
- self.apply_state(state)
253
-
254
- def forward(self, input_ids, attention_mask=None, token_type_ids=None, output_all_encoded_layers=True, position_ids = None, return_att = False):
255
- """
256
- Args:
257
- input_ids:
258
- a torch.LongTensor of shape [batch_size, sequence_length] \
259
- with the word token indices in the vocabulary
260
-
261
- attention_mask:
262
- an optional parameter for input mask or attention mask.
263
-
264
- - If it's an input mask, then it will be torch.LongTensor of shape [batch_size, sequence_length] with indices \
265
- selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max \
266
- input sequence length in the current batch. It's the mask that we typically use for attention when \
267
- a batch has varying length sentences.
268
-
269
- - If it's an attention mask then it will be torch.LongTensor of shape [batch_size, sequence_length, sequence_length]. \
270
- In this case, it's a mask indicating which tokens in the sequence should be attended to by other tokens in the sequence.
271
-
272
- token_type_ids:
273
- an optional torch.LongTensor of shape [batch_size, sequence_length] with the token \
274
- types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to \
275
- a `sentence B` token (see BERT paper for more details).
276
-
277
- output_all_encoded_layers:
278
- whether to output results of all encoder layers; default: True
279
-
280
- Returns:
281
-
282
- - The output of the stacked transformer layers if `output_all_encoded_layers=True`, else \
283
- the last layer of stacked transformer layers
284
-
285
- - Attention matrix of self-attention layers if `return_att=True`
286
-
287
-
288
- Example::
289
-
290
- # Batch of wordPiece token ids.
291
- # Each sample was padded with zero to the maximum length of the batch
292
- input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
293
- # Mask of valid input ids
294
- attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
295
-
296
- # DeBERTa model initialized with pretrained base model
297
- bert = DeBERTa(pre_trained='base')
298
-
299
- encoder_layers = bert(input_ids, attention_mask=attention_mask)
300
-
301
- """
302
-
303
- if attention_mask is None:
304
- attention_mask = torch.ones_like(input_ids)
305
- if token_type_ids is None:
306
- token_type_ids = torch.zeros_like(input_ids)
307
- token_mask = torch.ones_like(input_ids)
308
- else:
309
- idxs = torch.flip(torch.cumsum(torch.flip(token_type_ids, [-1]), axis=1), [-1])
310
- token_mask = idxs > 0
311
- token_mask = token_mask.byte()
312
- ebd_output = self.embeddings(input_ids.to(torch.long), token_type_ids.to(torch.long), position_ids, token_mask)
313
- embedding_output = ebd_output['embeddings']
314
- encoder_output = self.encoder(embedding_output,
315
- attention_mask,
316
- output_all_encoded_layers=output_all_encoded_layers, return_att = return_att)
317
- encoder_output.update(ebd_output)
318
- return encoder_output
319
-
320
- def apply_state(self, state = None):
321
- """ Load state from previous loaded model state dictionary.
322
-
323
- Args:
324
- state (:obj:`dict`, optional): State dictionary as the state returned by torch.module.state_dict(), default: `None`. \
325
- If it's `None`, then the method will use the pre-trained state loaded via the constructor to re-initialize \
326
- the `DeBERTa` model
327
- """
328
- if self.pre_trained is None and state is None:
329
- return
330
- if state is None:
331
- state, config = load_model_state(self.pre_trained)
332
- self.config = config
333
-
334
- prefix = ''
335
- for k in state:
336
- if 'embeddings.' in k:
337
- if not k.startswith('embeddings.'):
338
- prefix = k[:k.index('embeddings.')]
339
- break
340
-
341
- missing_keys = []
342
- unexpected_keys = []
343
- error_msgs = []
344
- self._load_from_state_dict(state, prefix = prefix, local_metadata=None, strict=True, missing_keys=missing_keys, unexpected_keys=unexpected_keys, error_msgs=error_msgs)
345
-
346
-
347
- class MaskedLanguageModel(NNModule):
348
- """ Masked language model
349
- """
350
- def __init__(self, config, *wargs, **kwargs):
351
- super().__init__(config)
352
- self.backbone = WywLM(config)
353
-
354
- self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
355
- self.position_buckets = getattr(config, 'position_buckets', -1)
356
- if self.max_relative_positions <1:
357
- self.max_relative_positions = config.max_position_embeddings
358
- # self.mlm_predictions = UGDecoder(self.backbone.config, self.backbone.embeddings.word_embeddings.weight.size(0))
359
- self.lm_predictions = UGDecoder(self.backbone.config, self.backbone.embeddings.word_embeddings.weight.size(0))
360
- self.device = None
361
- self.loss = WywLMLoss(config)
362
- # self.loss_lm = WywLMLoss(config)
363
- self.apply(self.init_weights)
364
-
365
- def forward(self, samples, position_ids=None):
366
- task = samples['task']
367
- if task == 0:
368
- input_ids = samples['s2s_input_ids']
369
- type_ids = samples['s2s_token_type_ids']
370
- attention_mask = samples['s2s_attention_mask']
371
- labels = samples['s2s_masked_lm_labels']
372
- dict_pos = samples['dict_pos']
373
- s2s_label = samples['s2s_label']
374
- else:
375
- input_ids = samples['input_ids']
376
- type_ids = samples['token_type_ids']
377
- attention_mask = samples['attention_mask']
378
- labels = samples['masked_lm_labels']
379
- dict_pos = samples['dict_pos']
380
- s2s_label = samples['s2s_label']
381
-
382
- if self.device is None:
383
- self.device = list(self.parameters())[0].device
384
-
385
- input_ids = input_ids.to(self.device)
386
-
387
- type_ids = None
388
- lm_labels = labels.to(self.device)
389
- s2s_label = s2s_label.to(self.device)
390
- attention_mask = attention_mask.to(self.device)
391
-
392
- encoder_output = self.backbone(input_ids, attention_mask, type_ids, output_all_encoded_layers=True, position_ids = position_ids)
393
- encoder_layers = encoder_output['hidden_states']
394
- z_states = encoder_output['position_embeddings']
395
- ctx_layer = encoder_layers[-1]
396
- mlm_loss = torch.tensor(0).to(ctx_layer).float()
397
- lm_loss = torch.tensor(0).to(ctx_layer).float()
398
- lm_logits = None
399
- label_inputs = None
400
- loss = torch.tensor(0).to(ctx_layer).float()
401
- loss_compare = torch.tensor(0).to(ctx_layer).float()
402
-
403
- ebd_weight = self.backbone.embeddings.word_embeddings.weight
404
- lm_logits, mlm_logits = self.lm_predictions(encoder_layers, self.backbone.embeddings.word_embeddings,
405
- input_ids, z_states,
406
- attention_mask, self.backbone.encoder,
407
- target_ids=lm_labels)
408
- # if lm_labels.detach().sum() != 0:
409
- loss_compare, mlm_loss, lm_loss = self.loss(mlm_logits,
410
- lm_logits,
411
- lm_labels,
412
- dict_pos,
413
- target_ids_s2s=s2s_label,
414
- decode=False,
415
- ebd_weight=ebd_weight,
416
- input_ids=input_ids,
417
- task=task)
418
- loss = loss_compare * 10 + mlm_loss + lm_loss
419
- # if s2s_label.detach().sum() != 0:
420
- # s2s_idx = (s2s_label.sum(-1)>0).nonzero().view(-1)
421
- # s2s_label = s2s_label.index_select(0, s2s_idx)
422
- # # ebd_weight = self.backbone.embeddings.word_embeddings.weight
423
- # # lm_logits = self.lm_predictions(encoder_layers[-3], self.backbone.embeddings.word_embeddings,
424
- # # input_ids.index_select(0, s2s_idx), z_states.index_select(0, s2s_idx),
425
- # # attention_mask.index_select(0, s2s_idx), self.backbone.encoder,
426
- # # target_ids=s2s_label,
427
- # # decode=True, s2s_idx=s2s_idx)
428
- # # lm_logits = encoder_layers[-1].detach().index_select(0, s2s_idx)
429
- # _, lm_loss = self.loss_lm(lm_logits,
430
- # s2s_label,
431
- # torch.zeros_like(dict_pos),
432
- # decode=True,
433
- # ebd_weight=ebd_weight,
434
- # input_ids=input_ids.index_select(0, s2s_idx))
435
- # lm_loss = lm_logits.max()
436
- # loss = loss + lm_loss
437
-
438
- return {
439
- 'logits' : lm_logits,
440
- 'labels' : lm_labels,
441
- 's2s_label': s2s_label,
442
- 'loss' : loss.float(),
443
- 'loss_compare': loss_compare.float(),
444
- 'lm_loss': lm_loss.float(),
445
- 'mlm_loss': mlm_loss.float()
446
- }