qilowoq committed on
Commit
001cc1f
1 Parent(s): e966d03

Upload AbLang

Files changed (6)
  1. config.json +25 -0
  2. config.py +33 -0
  3. encoderblocks.py +112 -0
  4. extra_fns.py +26 -0
  5. model.py +54 -0
  6. pytorch_model.bin +3 -0
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "ablang-test",
+   "architectures": [
+     "AbLang"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "auto_map": {
+     "AutoConfig": "config.AbLangConfig",
+     "AutoModel": "model.AbLang"
+   },
+   "chain": "heavy",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 160,
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "ptid": 21,
+   "torch_dtype": "float32",
+   "transformers_version": "4.26.1",
+   "vocab_size": 24
+ }
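
The auto_map block above wires AutoConfig and AutoModel to the custom config.py and model.py modules shipped in this commit, so the checkpoint can be loaded with trust_remote_code=True. A minimal loading sketch (the repo id below is a placeholder for wherever this checkpoint lives; fairseq must also be installed because encoderblocks.py imports its MultiheadAttention):

from transformers import AutoConfig, AutoModel

repo = "qilowoq/ablang-test"  # hypothetical repo id / local path; substitute the actual location

# trust_remote_code=True makes transformers import config.AbLangConfig and model.AbLang
# from this repository instead of a built-in architecture.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_pretrained(repo, trust_remote_code=True)
print(config.chain, config.max_position_embeddings)  # heavy 160
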
config.py ADDED
@@ -0,0 +1,33 @@
+ from transformers import PretrainedConfig
+ from typing import List
+
+ class AbLangConfig(PretrainedConfig):
+     def __init__(
+         self,
+         max_position_embeddings: int = 160,
+         hidden_size: int = 768,
+         num_hidden_layers: int = 12,
+         num_attention_heads: int = 12,
+         attention_probs_dropout_prob: float = 0.1,
+         intermediate_size: int = 3072,
+         hidden_act: str = "gelu",
+         hidden_dropout_prob: float = 0.1,
+         initializer_range: float = 0.02,
+         layer_norm_eps: float = 1e-12,
+         chain: str = "heavy",
+         **kwargs,
+     ):
+         self.ptid = 21
+         self.vocab_size = 24
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+         self.intermediate_size = intermediate_size
+         self.hidden_act = hidden_act
+         self.hidden_dropout_prob = hidden_dropout_prob
+         self.initializer_range = initializer_range
+         self.layer_norm_eps = layer_norm_eps
+         self.chain = chain
+         super().__init__(**kwargs)
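
Note that vocab_size and ptid (the padding-token id) are hard-coded in __init__ rather than exposed as arguments, so every AbLangConfig carries the fixed 24-token antibody vocabulary with pad id 21. A small sketch of building the config directly, assuming the files from this commit are importable:

from config import AbLangConfig

cfg = AbLangConfig(chain="heavy", num_hidden_layers=12, hidden_size=768)
assert cfg.vocab_size == 24 and cfg.ptid == 21  # fixed regardless of the arguments passed
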
encoderblocks.py ADDED
@@ -0,0 +1,112 @@
+ import math
+ from transformers import PreTrainedModel
+ from typing import List, Optional, Tuple
+ from dataclasses import dataclass
+ import torch
+ import torch.nn as nn
+ from fairseq.modules.multihead_attention import MultiheadAttention
+ from .extra_fns import ACT2FN
+
+
+ @dataclass
+ class AbRepOutput():
+     """
+     Dataclass used to store AbRep output.
+     """
+     last_hidden_states: torch.FloatTensor
+     all_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+ class EncoderBlocks(PreTrainedModel):
+     """
+     Wrapper for multiple EncoderBlocks (or a single one).
+     """
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+         self.Layers = nn.ModuleList([EncoderBlock(config) for _ in range(config.num_hidden_layers)])
+
+     def forward(self, hidden_states, attention_mask=None, output_attentions=False, output_hidden_states=False):
+         all_hidden_states = () if output_hidden_states else None
+         all_self_attentions = () if output_attentions else None
+         for num_block, a_EncoderBlock in enumerate(self.Layers):
+             hidden_states, attentions = a_EncoderBlock(hidden_states, attention_mask, output_attentions)
+             if output_hidden_states:
+                 all_hidden_states = all_hidden_states + (hidden_states,)  # collects the hidden states after each EncoderBlock
+             if output_attentions:
+                 all_self_attentions = all_self_attentions + (attentions,)  # collects attention weights for analysis
+         return AbRepOutput(last_hidden_states=hidden_states, all_hidden_states=all_hidden_states, attentions=all_self_attentions)
+
+
+ class EncoderBlock(PreTrainedModel):
+     """
+     Single EncoderBlock.
+     An EncoderBlock consists of a MultiHeadAttention and an IntermediateLayer.
+     """
+     def __init__(self, config):
+         super().__init__(config)
+         self.MultiHeadAttention = ThirdMultiHeadAttention(config)
+         self.MHADropout = nn.Dropout(config.hidden_dropout_prob)
+         self.MHALayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.IntermediateLayer = IntermediateLayer(config)
+
+     def forward(self, hidden_states, attention_mask=None, output_attentions=False):
+         MHAoutput, attentions = self.MultiHeadAttention(hidden_states, attention_mask, output_attentions=output_attentions)
+         output = self.MHADropout(MHAoutput)
+         output = self.MHALayerNorm(output + hidden_states)  # hidden_states are added back for the residual connection
+         output = self.IntermediateLayer(output)  # IntermediateLayer applies its residual connection internally
+         return output, attentions
+
+
+ class ThirdMultiHeadAttention(PreTrainedModel):
+     """
+     New MultiHeadAttention which can return the weights of the individual heads.
+     """
+     def __init__(self, config):
+         super().__init__(config)
+         self.Attention = MultiheadAttention(config.hidden_size, config.num_attention_heads, dropout=config.attention_probs_dropout_prob, self_attention=True)
+
+     def forward(self, hidden_states, attention_mask=None, output_attentions=False):
+         hidden_states = torch.transpose(hidden_states, 0, 1)
+         # static_kv is only True because of a current bug: the head weights are not returned unaveraged unless it is True
+         attn_output, attn_weights = self.Attention(hidden_states, hidden_states, hidden_states, key_padding_mask=attention_mask, static_kv=True,
+                                                     need_weights=output_attentions, need_head_weights=output_attentions)
+         return torch.transpose(attn_output, 0, 1), attn_weights
+
+
+ class OldMultiHeadAttention(PreTrainedModel):
+     """
+     MultiHeadAttention contains a Scaled Dot Product Attention and a Linear Layer.
+     """
+     def __init__(self, config):
+         super().__init__(config)
+         self.Attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, config.attention_probs_dropout_prob)
+
+     def forward(self, hidden_states, attention_mask=None, output_attentions=False):
+         hidden_states = torch.transpose(hidden_states, 0, 1)
+         output, attentions = self.Attention(hidden_states, hidden_states, hidden_states, key_padding_mask=attention_mask, need_weights=output_attentions)
+         attention_output = torch.transpose(output, 0, 1)
+         return attention_output, attentions
+
+
+ class IntermediateLayer(PreTrainedModel):
+     """
+     Contains an expanding layer, while also functioning as a residual block ending with a drop-norm layer.
+     """
+     def __init__(self, config):
+         super().__init__(config)
+         self.expand_dense = nn.Linear(config.hidden_size, config.intermediate_size)
+         self.intermediate_act_fn = ACT2FN[config.hidden_act]
+
+         self.dense_dense = nn.Linear(config.intermediate_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+     def forward(self, hidden_states):
+         output = self.expand_dense(hidden_states)
+         output = self.intermediate_act_fn(output)
+         output = self.dense_dense(output)
+         output = self.dropout(output)
+         output = self.LayerNorm(output + hidden_states)
+         return output
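
One detail worth flagging: fairseq's MultiheadAttention (like torch.nn.MultiheadAttention) expects key_padding_mask to mark the positions to ignore, which is the inverse of the Hugging Face attention_mask convention where 1 marks real tokens. That is why model.py below passes 1 - attention_mask into EncoderBlocks. A tiny illustration:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])  # Hugging Face convention: 1 = real token, 0 = padding
key_padding_mask = 1 - attention_mask             # what model.py hands to EncoderBlocks
print(key_padding_mask)                           # tensor([[0, 0, 0, 1, 1]]) -> padded positions flagged
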
extra_fns.py ADDED
@@ -0,0 +1,26 @@
+ import torch
+ import math
+
+
+ def gelu_new(x):
+     """
+     Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
+     the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
+     """
+     return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
+
+ def gelu_fast(x):
+     return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))
+
+ def mish(x):
+     return x * torch.tanh(torch.nn.functional.softplus(x))
+
+ ACT2FN = {
+     "relu": torch.nn.functional.relu,
+     "gelu": torch.nn.functional.gelu,
+     "tanh": torch.tanh,
+     "gelu_new": gelu_new,
+     "gelu_fast": gelu_fast,
+     "mish": mish,
+     "sigmoid": torch.sigmoid,
+ }
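
Since config.json sets hidden_act to "gelu", AbLang uses PyTorch's exact GELU; the custom gelu_new above is the tanh approximation, which PyTorch 1.12+ also exposes via the approximate argument. A quick sanity check, assuming extra_fns.py from this commit is importable:

import torch
from extra_fns import gelu_new

x = torch.randn(8)
# gelu_new implements the tanh approximation of GELU, matching F.gelu(..., approximate="tanh")
assert torch.allclose(gelu_new(x), torch.nn.functional.gelu(x, approximate="tanh"), atol=1e-6)
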
model.py ADDED
@@ -0,0 +1,54 @@
+ import torch
+ from transformers import PreTrainedModel
+ from .extra_fns import ACT2FN
+ from .encoderblocks import EncoderBlocks
+ from .config import AbLangConfig
+
+ class AbEmbeddings(PreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.pad_token_id = config.ptid
+         self.AAEmbeddings = torch.nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.pad_token_id)
+         self.PositionEmbeddings = torch.nn.Embedding(config.max_position_embeddings, config.hidden_size, padding_idx=0)  # here padding_idx is always 0
+         self.LayerNorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.Dropout = torch.nn.Dropout(config.hidden_dropout_prob)
+
+     def forward(self, src):
+         inputs_embeds = self.AAEmbeddings(src)
+         position_ids = self.create_position_ids_from_input_ids(src, self.pad_token_id)
+         position_embeddings = self.PositionEmbeddings(position_ids)
+         embeddings = inputs_embeds + position_embeddings
+         return self.Dropout(self.LayerNorm(embeddings))
+
+     def create_position_ids_from_input_ids(self, input_ids, padding_idx):
+         """
+         Replace non-padding symbols with their position numbers. Padding idx will get position 0, which will be ignored later on.
+         """
+         mask = input_ids.ne(padding_idx).int()
+         return torch.cumsum(mask, dim=1).long() * mask
+
+
+ class AbLang(PreTrainedModel):
+     config_class = AbLangConfig
+     def __init__(self, config):
+         super().__init__(config)
+         self.AbEmbeddings = AbEmbeddings(config)
+         self.EncoderBlocks = EncoderBlocks(config)
+
+     def forward(self, inputs):
+         src = self.AbEmbeddings(inputs['input_ids'])
+         outputs = self.EncoderBlocks(src, attention_mask=1-inputs['attention_mask'], output_attentions=False)
+         return apply_cls_embeddings(inputs, outputs)
+
+ def apply_cls_embeddings(inputs, outputs):
+     mask = inputs['attention_mask'].float()
+     d = {k: v for k, v in torch.nonzero(mask).cpu().numpy()}  # maps each sequence to its last non-padding position (the sep token)
+     # make sep token invisible
+     for i in d:
+         mask[i, d[i]] = 0
+     mask[:, 0] = 0.0  # make cls token invisible
+     mask = mask.unsqueeze(-1).expand(outputs.last_hidden_states.size())
+     sum_embeddings = torch.sum(outputs.last_hidden_states * mask, 1)
+     sum_mask = torch.clamp(mask.sum(1), min=1e-9)
+     outputs.last_hidden_states[:, 0, :] = sum_embeddings / sum_mask
+     return outputs
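
Taken together, AbLang.forward embeds the input ids, runs them through the encoder stack (inverting the Hugging Face attention mask into a padding mask), and then overwrites position 0 of last_hidden_states with the mean of the residue embeddings, excluding the CLS and SEP positions. A minimal forward-pass sketch, continuing from the AutoModel example above; the token ids are made up for illustration (vocabulary indices 0-23, pad id 21), not produced by a real tokenizer:

import torch

inputs = {
    "input_ids": torch.tensor([[0, 5, 7, 9, 22, 21, 21]]),    # hypothetical CLS ... SEP pad pad layout
    "attention_mask": torch.tensor([[1, 1, 1, 1, 1, 0, 0]]),  # 1 = real token, 0 = padding
}

with torch.no_grad():
    out = model(inputs)  # model loaded earlier via AutoModel.from_pretrained(..., trust_remote_code=True)

# AbRepOutput: position 0 now holds the mean residue embedding, usable as a sequence-level representation.
print(out.last_hidden_states.shape)  # torch.Size([1, 7, 768])
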
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dec3268da263e5c21085a7e736c81b521da2662822c5c86d4024c7e558a1b669
+ size 340855773