rooa commited on
Commit
04d6513
1 Parent(s): 70b1103
README.md CHANGED
@@ -1,3 +1,47 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ # XGen
6
+
7
+ Official research release for the family of **XGen** models (`7B`) by Salesforce AI Research:
8
+
9
+ *Title*: [Long Sequence Modeling with XGen: A 7B LLM Trained on 8K Input Sequence Length](https://blog.salesforceairesearch.com/xgen/)
10
+
11
+ ## Models
12
+
13
+ * [XGen-7B-4K-Base](https://huggingface.co/Salesforce/xgen-7b-4k-base): Pretrained XGen-7B model trained under 4K sequence length.
14
+ * License: Apache-2.0
15
+ * [XGen-7B-8K-Base](https://huggingface.co/Salesforce/xgen-7b-8k-base): Pretrained XGen-7B model trained under 8K sequence length.
16
+ * License: Apache-2.0
17
+
18
+ The training data for the models are tokenized with OpenAI Tiktoken library.
19
+ To use this model, install tiktoken library via `pip`:
20
+
21
+ ```sh
22
+ pip install tiktoken
23
+ ```
24
+
25
+ The models can be used as auto-regressive samplers as follows:
26
+
27
+ ```python
28
+ from transformers import AutoTokenizer, AutoModelForCausalLM
29
+
30
+ tokenizer = AutoTokenizer.from_pretrained("Salesforce/xgen-7b-8k-base", trust_remote_code=True)
31
+ model = AutoModelForCausalLM.from_pretrained("Salesforce/xgen-7b-8k-base", torch_dtype=torch.bfloat16)
32
+ inputs = tokenizer("The world is", return_tensors="pt")
33
+ sample = model.generate(**inputs, max_length=128)
34
+ print(tokenizer.decode(sample[0]))
35
+ ```
36
+
37
+ ## Citation
38
+
39
+ ```bibtex
40
+ @misc{XGen,
41
+ title={Long Sequence Modeling with XGen: A 7B LLM Trained on 8K Input Sequence Length},
42
+ author={Salesforce AI Research},
43
+ howpublished={Salesforce AI Research Blog},
44
+ year={2023},
45
+ url={https://blog.salesforceairesearch.com/xgen-7b/}
46
+ }
47
+ ```
config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "bos_token_id": 1,
6
+ "eos_token_id": 2,
7
+ "hidden_act": "silu",
8
+ "hidden_size": 4096,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 11008,
11
+ "max_position_embeddings": 2048,
12
+ "model_type": "llama",
13
+ "num_attention_heads": 32,
14
+ "num_hidden_layers": 32,
15
+ "pad_token_id": 0,
16
+ "rms_norm_eps": 1e-06,
17
+ "tie_word_embeddings": false,
18
+ "torch_dtype": "float32",
19
+ "transformers_version": "4.29.2",
20
+ "use_cache": true,
21
+ "vocab_size": 51200
22
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.29.2"
7
+ }
pytorch_model-00001-of-00003.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a74cc65949d3705fb377be925247410886d7b3fb543491c51aafbd88bafa1bc
3
+ size 9945097125
pytorch_model-00002-of-00003.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee56d1f63a129f9bed5800fa6f629f90e2fb712bcd1c2afe96cda6feb1fafd8e
3
+ size 9961910848
pytorch_model-00003-of-00003.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:931442a83a9335d0d0d49854401e2ea40fc43ec25b63a5a64ec0995e9f06b41b
3
+ size 7675918907
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 27582816256
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00003-of-00003.bin",
7
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00003.bin",
8
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
9
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
10
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
11
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
12
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
13
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
14
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
15
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
16
+ "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
17
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
18
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
19
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
20
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
21
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
22
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
23
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
24
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
25
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
26
+ "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
27
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
28
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
29
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
30
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
31
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
32
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
33
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
34
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
35
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
36
+ "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
37
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
38
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
39
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
40
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
41
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
42
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
43
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
44
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
45
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
46
+ "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
47
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
48
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
49
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
50
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
51
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
52
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
53
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
54
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
55
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
56
+ "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
57
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
58
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
59
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
60
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
61
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
62
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
63
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
64
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
65
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
66
+ "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
67
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
68
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
69
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
70
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
71
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
72
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
73
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
74
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
75
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
76
+ "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
77
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
78
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
79
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
80
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
81
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
82
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
83
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
84
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
85
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
86
+ "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
87
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
88
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
89
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
90
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
91
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
92
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
93
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
94
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
95
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
96
+ "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
97
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
98
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
99
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
100
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
101
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
102
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
103
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
104
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
105
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
106
+ "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
107
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
108
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
109
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
110
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
111
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
112
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
113
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
114
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
115
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
116
+ "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
117
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
118
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
119
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
120
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
121
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
122
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
123
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
124
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
125
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
126
+ "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
127
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
128
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
129
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
130
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
131
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
132
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
133
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
134
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
135
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
136
+ "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
137
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
138
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
139
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
140
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
141
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
142
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
143
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
144
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
145
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
146
+ "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
147
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
148
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
149
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
150
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
151
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
152
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
153
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
154
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
155
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
156
+ "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
157
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
158
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
159
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
160
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
161
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
162
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
163
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
164
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
165
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
166
+ "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
167
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
168
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
169
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
170
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
171
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
172
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
173
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
174
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
175
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
176
+ "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
177
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
178
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
179
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
180
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
181
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
182
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
183
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
184
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
185
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
186
+ "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
187
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
188
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
189
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
190
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
191
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
192
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
193
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
194
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
195
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
196
+ "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
197
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
198
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
199
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
200
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
201
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
202
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
203
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
204
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
205
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
206
+ "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
207
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
208
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
209
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
210
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
211
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
212
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
213
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
214
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
215
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
216
+ "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
217
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
218
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
219
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
220
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
221
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
222
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
223
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
224
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
225
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
226
+ "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
227
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
228
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
229
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
230
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
231
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
232
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
233
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
234
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
235
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
236
+ "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
237
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
238
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
239
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
240
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
241
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
242
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
243
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
244
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
245
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
246
+ "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
247
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
248
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
249
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
250
+ "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
251
+ "model.layers.30.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
252
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
253
+ "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
254
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
255
+ "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
256
+ "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
257
+ "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
258
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
259
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
260
+ "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
261
+ "model.layers.31.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
262
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
263
+ "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
264
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
265
+ "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
266
+ "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
267
+ "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
268
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
269
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
270
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
271
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
272
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
273
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
274
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
275
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
276
+ "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
277
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
278
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
279
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
280
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
281
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
282
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
283
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
284
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
285
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
286
+ "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
287
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
288
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
289
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
290
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
291
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
292
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
293
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
294
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
295
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
296
+ "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
297
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
298
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
299
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
300
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
301
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
302
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
303
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
304
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
305
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
306
+ "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
307
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
308
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
309
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
310
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
311
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
312
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
313
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
314
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
315
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
316
+ "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
317
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
318
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
319
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
320
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
321
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
322
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
323
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
324
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
325
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
326
+ "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
327
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
328
+ "model.norm.weight": "pytorch_model-00003-of-00003.bin"
329
+ }
330
+ }
tokenization_xgen.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tokenization classes for xgen."""
2
+
3
+ from typing import List, Optional
4
+
5
+ import tiktoken
6
+
7
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
8
+ from transformers.utils import logging
9
+
10
+ logger = logging.get_logger(__name__)
11
+
12
+ MAX_MODEL_INPUT_SIZES = {
13
+ "Salesforce/xgen-7b-4k-base": 4096,
14
+ "Salesforce/xgen-7b-8k-base": 8192,
15
+ "Salesforce/xgen-7b-4k-inst": 4096,
16
+ "Salesforce/xgen-7b-8k-inst": 8192
17
+ }
18
+
19
+
20
+ def tiktoken_tokenizer(base="gpt2", add_special=True):
21
+ if not add_special:
22
+ return tiktoken.get_encoding(base)
23
+
24
+ def include_whitespace(n_min=2, n_max=20):
25
+ whitespaces = [" " * n for n in reversed(range(n_min, n_max))]
26
+ return whitespaces
27
+
28
+ def include_tabs(n_min=2, n_max=20):
29
+ tabs = ["\t" * n for n in reversed(range(n_min, n_max))]
30
+ return tabs
31
+
32
+ def include_fim_tokens():
33
+ fim_tokens = [
34
+ "<fim_prefix>",
35
+ "<fim_middle>",
36
+ "<fim_suffix>",
37
+ "<fim_pad>",
38
+ "<filename>",
39
+ "<gh_stars>",
40
+ "<issue_start>",
41
+ "<issue_comment>",
42
+ "<issue_closed>",
43
+ "<jupyter_start>",
44
+ "<jupyter_text>",
45
+ "<jupyter_code>",
46
+ "<jupyter_output>",
47
+ "<empty_output>",
48
+ "<commit_before>",
49
+ "<commit_msg>",
50
+ "<commit_after>",
51
+ "<reponame>"
52
+ ]
53
+ return fim_tokens
54
+
55
+ add_whitespaces = include_whitespace(n_min=2, n_max=32)
56
+ add_tabs = include_tabs(n_min=2, n_max=10)
57
+ fim_tokens = include_fim_tokens()
58
+
59
+ tokenizer = tiktoken.get_encoding(base)
60
+
61
+ idx = tokenizer.n_vocab
62
+
63
+ bpe_ranks = tokenizer._mergeable_ranks
64
+
65
+ for wsp in add_whitespaces:
66
+ bpe_ranks[bytes(wsp, 'ascii')] = idx
67
+ idx += 1
68
+ for t in add_tabs:
69
+ bpe_ranks[bytes(t, 'ascii')] = idx
70
+ idx += 1
71
+
72
+ special_tokens = dict()
73
+
74
+ for sp in fim_tokens:
75
+ special_tokens[sp] = idx
76
+ idx += 1
77
+
78
+ # In production, load the arguments directly instead of accessing private attributes
79
+ # See openai_public.py for examples of arguments for specific encodings
80
+ enc = tiktoken.Encoding(
81
+ # If you're changing the set of special tokens, make sure to use a different name
82
+ # It should be clear from the name what behaviour to expect.
83
+ name=base.replace("base", "im"),
84
+ pat_str=tokenizer._pat_str,
85
+ mergeable_ranks=bpe_ranks,
86
+ special_tokens={
87
+ **tokenizer._special_tokens,
88
+ **special_tokens
89
+ }
90
+ )
91
+ return enc
92
+
93
+
94
+ class XgenTokenizer(PreTrainedTokenizer):
95
+ """
96
+ Construct a Xgen tokenizer. Based on byte-level Byte-Pair-Encoding.
97
+ Args:
98
+ vocab_file (`str`):
99
+ Path to the vocabulary file.
100
+ """
101
+ max_model_input_sizes = MAX_MODEL_INPUT_SIZES
102
+ model_input_names = ["input_ids", "attention_mask"]
103
+
104
+ def __init__(
105
+ self,
106
+ pad_token=None,
107
+ add_eos_token=False,
108
+ add_special_tokens=True,
109
+ **kwargs,
110
+ ):
111
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
112
+ super().__init__(
113
+ pad_token=pad_token,
114
+ add_eos_token=add_eos_token,
115
+ add_special_tokens=add_special_tokens,
116
+ **kwargs,
117
+ )
118
+ self.add_eos_token = add_eos_token
119
+ self.encoder = tiktoken_tokenizer(base="gpt2", add_special=add_special_tokens)
120
+
121
+ @property
122
+ def vocab_size(self):
123
+ """Returns vocab size"""
124
+ return self.encoder.n_vocab
125
+
126
+ def get_vocab(self):
127
+ """Returns vocab as a dict"""
128
+ vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
129
+ return vocab
130
+
131
+ def _tokenize(self, text, **kwargs):
132
+ """Returns a tokenized string."""
133
+ return self.encoder.encode(text, allowed_special="all")
134
+
135
+ def _convert_token_to_id(self, token):
136
+ """Converts a token (str) in an id using the vocab."""
137
+ return token
138
+
139
+ def _convert_id_to_token(self, index):
140
+ """Converts an index (integer) in a token (str) using the vocab."""
141
+ return self.encoder.decode_single_token_bytes(index)
142
+
143
+ def _decode(self, token_ids: List[int], skip_special_tokens: bool = False, **kwargs):
144
+ return self.encoder.decode(token_ids)
145
+
146
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
147
+ """Build model inputs from a sequence by appending eos_token_id."""
148
+ eos_token_id = [50256] if self.add_eos_token else []
149
+
150
+ output = token_ids_0 + eos_token_id
151
+
152
+ if token_ids_1 is not None:
153
+ output = output + token_ids_1 + eos_token_id
154
+
155
+ return output
156
+
157
+ def get_special_tokens_mask(
158
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
159
+ already_has_special_tokens: bool = False
160
+ ) -> List[int]:
161
+ """
162
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
163
+ special tokens using the tokenizer `prepare_for_model` method.
164
+ Args:
165
+ token_ids_0 (`List[int]`):
166
+ List of IDs.
167
+ token_ids_1 (`List[int]`, *optional*):
168
+ Optional second list of IDs for sequence pairs.
169
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
170
+ Whether the token list is already formatted with special tokens for the model.
171
+ Returns:
172
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
173
+ """
174
+ if already_has_special_tokens:
175
+ return super().get_special_tokens_mask(
176
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
177
+ )
178
+
179
+ eos_token_id = [1] if self.add_eos_token else []
180
+
181
+ if token_ids_1 is None:
182
+ return ([0] * len(token_ids_0)) + eos_token_id
183
+ return ([0] * len(token_ids_0)) + eos_token_id + ([0] * len(token_ids_1)) + eos_token_id
184
+
185
+ def create_token_type_ids_from_sequences(
186
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
187
+ ) -> List[int]:
188
+ """
189
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
190
+ sequence pair mask has the following format:
191
+ ```
192
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
193
+ | first sequence | second sequence |
194
+ ```
195
+ if token_ids_1 is None, only returns the first portion of the mask (0s).
196
+ Args:
197
+ token_ids_0 (`List[int]`):
198
+ List of ids.
199
+ token_ids_1 (`List[int]`, *optional*):
200
+ Optional second list of IDs for sequence pairs.
201
+ Returns:
202
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
203
+ """
204
+ eos_token_id = [50256] if self.add_eos_token else []
205
+
206
+ output = [0] * len(token_ids_0 + eos_token_id)
207
+
208
+ if token_ids_1 is not None:
209
+ output += [1] * len(token_ids_1 + eos_token_id)
210
+
211
+ return output
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_eos_token": false,
3
+ "add_special_tokens": true,
4
+ "clean_up_tokenization_spaces": true,
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": null,
7
+ "tokenizer_class": "XgenTokenizer",
8
+ "auto_map": {
9
+ "AutoTokenizer": ["tokenization_xgen.XgenTokenizer", null]
10
+ }
11
+ }