yuewang-sf committed on
Commit
ad0691e
1 Parent(s): c4b400a

update model files

added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "[CDEC]": 32102,
+   "[ENC]": 32100,
+   "[TDEC]": 32101
+ }
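
added_tokens.json extends the base CodeT5 vocabulary with three task-marker tokens at IDs 32100-32102. A minimal sketch for checking the resulting IDs, assuming the repository is consumed from the Hub as Salesforce/codet5p-110m-embedding:

# Sketch (assumption: Hub access and the checkpoint name above).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-110m-embedding", trust_remote_code=True)
for token in ["[ENC]", "[TDEC]", "[CDEC]"]:
    print(token, tokenizer.convert_tokens_to_ids(token))  # expected 32100, 32101, 32102
print(len(tokenizer))  # expected 32103, matching vocab_size in config.json
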
config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "_name_or_path": "Salesforce/codet5p-110m-embedding",
+   "architectures": [
+     "CodeT5p_Embedding"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_codet5p_embedding.CodeT5pEmbeddingConfig",
+     "AutoModel": "modeling_codet5p_embedding.CodeT5pEmbeddingModel"
+   },
+   "bos_token_id": 1,
+   "d_ff": 3072,
+   "d_kv": 64,
+   "d_model": 768,
+   "embed_dim": 256,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "relu",
+   "dropout_rate": 0.1,
+   "eos_token_id": 2,
+   "feed_forward_proj": "relu",
+   "gradient_checkpointing": false,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": false,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "codet5p_embedding",
+   "n_positions": 512,
+   "num_heads": 12,
+   "num_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+
+   "torch_dtype": "float32",
+   "transformers_version": "4.21.3",
+   "use_cache": true,
+   "vocab_size": 32103
+ }
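
The auto_map entry routes AutoConfig/AutoModel to the custom classes added in this commit, so the checkpoint must be loaded with trust_remote_code=True. A short sketch of inspecting the key dimensions (d_model is the T5 encoder width, embed_dim the projection output):

# Sketch: load the remote-code config and inspect its dimensions.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Salesforce/codet5p-110m-embedding", trust_remote_code=True)
print(config.model_type)   # "codet5p_embedding"
print(config.d_model)      # 768, encoder hidden size
print(config.embed_dim)    # 256, size of the projected embedding
print(config.hidden_size)  # 768, alias provided by attribute_map in the config class
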
configuration_codet5p_embedding.py ADDED
@@ -0,0 +1,72 @@
+ # coding=utf-8
+ # Copyright 2023 Salesforce authors, The EleutherAI, and HuggingFace Teams. All rights reserved.
+
+ """ CodeT5+ embedding model configuration"""
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ class CodeT5pEmbeddingConfig(PretrainedConfig):
+     model_type = "codet5p_embedding"
+     keys_to_ignore_at_inference = ["past_key_values"]
+     attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
+
+     def __init__(
+         self,
+         vocab_size=32103,
+         d_model=768,
+         embed_dim=256,
+         d_kv=64,
+         d_ff=3072,
+         num_layers=12,
+         num_heads=12,
+         relative_attention_num_buckets=32,
+         relative_attention_max_distance=128,
+         dropout_rate=0.1,
+         layer_norm_epsilon=1e-6,
+         initializer_factor=1.0,
+         feed_forward_proj="relu",
+         is_encoder_decoder=False,
+         use_cache=True,
+         pad_token_id=0,
+         eos_token_id=2,
+         **kwargs
+     ):
+         self.vocab_size = vocab_size
+         self.d_model = d_model
+         self.embed_dim = embed_dim
+         self.d_kv = d_kv
+         self.d_ff = d_ff
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.relative_attention_num_buckets = relative_attention_num_buckets
+         self.relative_attention_max_distance = relative_attention_max_distance
+         self.dropout_rate = dropout_rate
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.initializer_factor = initializer_factor
+         self.feed_forward_proj = feed_forward_proj
+         self.use_cache = use_cache
+
+         act_info = self.feed_forward_proj.split("-")
+         self.dense_act_fn = act_info[-1]
+         self.is_gated_act = act_info[0] == "gated"
+
+         if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
+             raise ValueError(
+                 f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. "
+                 "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
+                 "'gated-gelu' or 'relu'"
+             )
+
+         # for backwards compatibility
+         if feed_forward_proj == "gated-gelu":
+             self.dense_act_fn = "gelu_new"
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             eos_token_id=eos_token_id,
+             is_encoder_decoder=is_encoder_decoder,
+             **kwargs,
+         )
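
The constructor parses feed_forward_proj into dense_act_fn and is_gated_act, with a backwards-compatibility remap for "gated-gelu". A hedged sketch of that behavior, instantiating the config class above directly:

# Sketch: feed_forward_proj parsing in CodeT5pEmbeddingConfig (defined above).
relu_cfg = CodeT5pEmbeddingConfig(feed_forward_proj="relu")
print(relu_cfg.dense_act_fn, relu_cfg.is_gated_act)    # relu False

gated_cfg = CodeT5pEmbeddingConfig(feed_forward_proj="gated-gelu")
print(gated_cfg.dense_act_fn, gated_cfg.is_gated_act)  # gelu_new True (remapped for backwards compatibility)

# Anything else, e.g. feed_forward_proj="gated-gelu-x", raises the ValueError above.
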
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_codet5p_embedding.py ADDED
@@ -0,0 +1,53 @@
+ # coding=utf-8
+ # Copyright 2023 Salesforce authors, The EleutherAI, and HuggingFace Teams. All rights reserved.
+ """ PyTorch CodeT5+ embedding models.
+ The implementation is based on transformers.models.t5.modeling_t5, adding a projection layer on top of T5EncoderModel.
+ """
+
+ from typing import Optional, Tuple, Union
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ from transformers import T5EncoderModel
+ from transformers.modeling_outputs import (
+     BaseModelOutput,
+ )
+ from .configuration_codet5p_embedding import CodeT5pEmbeddingConfig
+
+
+ class CodeT5pEmbeddingModel(T5EncoderModel):
+     config_class = CodeT5pEmbeddingConfig
+
+     authorized_missing_keys = [
+         r"encoder.embed_tokens.weight",
+     ]
+
+     def __init__(self, config: CodeT5pEmbeddingConfig):
+         super().__init__(config)
+         self.proj = nn.Linear(config.d_model, config.embed_dim)
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         encoder_outputs = self.encoder(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+             head_mask=head_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         embedding = F.normalize(self.proj(encoder_outputs.last_hidden_state[:, 0, :]), dim=-1)
+
+         return embedding
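
forward() returns a single L2-normalized embed_dim-sized vector per input (the projection of the first-token hidden state), not a BaseModelOutput. A hedged end-to-end usage sketch; the two snippets being embedded are illustrative:

# Sketch: embed two code snippets and compare them (assumes trust_remote_code, CPU is fine).
import torch
from transformers import AutoModel, AutoTokenizer

checkpoint = "Salesforce/codet5p-110m-embedding"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)

snippets = ["def add(a, b):\n    return a + b", "def sub(a, b):\n    return a - b"]
inputs = tokenizer(snippets, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    emb = model(inputs.input_ids, attention_mask=inputs.attention_mask)

print(emb.shape)                 # torch.Size([2, 256])
print(emb.norm(dim=-1))          # ~1.0 per row, since forward() normalizes
print((emb[0] @ emb[1]).item())  # cosine similarity, as the vectors are unit length
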
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:097d1bd8c5df11eb82aa5b750d208eee17d570babf94c77158279dc992c6829b
+ size 439257889
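
pytorch_model.bin is tracked by Git LFS, so this commit only records the pointer (oid and byte size). After fetching the real weights (e.g. via git lfs pull or the Hub downloader), they can be checked against the pointer; the local path below is hypothetical:

# Sketch: verify downloaded weights against the LFS pointer above.
import hashlib, os

path = "codet5p-110m-embedding/pytorch_model.bin"  # hypothetical local clone path
expected_oid = "097d1bd8c5df11eb82aa5b750d208eee17d570babf94c77158279dc992c6829b"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print(os.path.getsize(path) == 439257889)  # size from the pointer
print(h.hexdigest() == expected_oid)       # sha256 from the pointer
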
special_tokens_map.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "additional_special_tokens": [
+     "[ENC]",
+     "[TDEC]",
+     "[CDEC]"
+   ],
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
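
The map follows the RoBERTa convention: <s> serves as both bos and cls, </s> as both eos and sep, so the <s> token at position 0 is what the projection in modeling_codet5p_embedding.py pools. A quick hedged check:

# Sketch: confirm the special-token roles and that <s> leads every encoded sequence.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-110m-embedding", trust_remote_code=True)
print(tokenizer.cls_token, tokenizer.bos_token)  # both "<s>"
print(tokenizer.sep_token, tokenizer.eos_token)  # both "</s>"

ids = tokenizer("print('hi')")["input_ids"]
print(ids[0] == tokenizer.cls_token_id)          # True: position 0 is <s>, the token the model pools
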
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 512,
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
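
tokenizer_config.json selects RobertaTokenizer with model_max_length 512, matching n_positions in config.json, so longer inputs need truncation. A hedged sketch with an illustrative long input:

# Sketch: truncation to the configured 512-token limit (long_source is illustrative).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-110m-embedding", trust_remote_code=True)
print(type(tokenizer).__name__, tokenizer.model_max_length)  # a Roberta-style tokenizer, 512

long_source = "x = 1\n" * 2000
enc = tokenizer(long_source, truncation=True)
print(len(enc["input_ids"]) <= 512)  # True once truncation is enabled
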
vocab.json ADDED
The diff for this file is too large to render. See raw diff