2264K committed on
Commit bb01e25 · verified · 1 Parent(s): 53386d1

Upload folder using huggingface_hub
added_tokens.json ADDED
@@ -0,0 +1,258 @@
+ {
+ "</arg_key>": 128045,
+ "</arg_value>": 128047,
+ "</think>": 128041,
+ "</tool_call>": 128043,
+ "</tool_response>": 128049,
+ "</tools>": 128051,
+ "<EMAIL>": 128037,
+ "<KEY>": 128038,
+ "<NAME>": 128036,
+ "<PASSWORD>": 128039,
+ "<arg_key>": 128044,
+ "<arg_value>": 128046,
+ "<code_to_intermediate>": 128018,
+ "<empty_output>": 128017,
+ "<file_sep>": 128008,
+ "<intermediate_to_code>": 128019,
+ "<issue_closed>": 128011,
+ "<issue_comment>": 128010,
+ "<issue_start>": 128009,
+ "<jupyter_code>": 128014,
+ "<jupyter_output>": 128015,
+ "<jupyter_script>": 128016,
+ "<jupyter_start>": 128012,
+ "<jupyter_text>": 128013,
+ "<pr>": 128020,
+ "<pr_base>": 128023,
+ "<pr_base_code>": 128025,
+ "<pr_comment>": 128028,
+ "<pr_diff>": 128026,
+ "<pr_diff_hunk>": 128027,
+ "<pr_diff_hunk_comment_line>": 128035,
+ "<pr_event_id>": 128029,
+ "<pr_file>": 128024,
+ "<pr_in_reply_to_comment_id>": 128034,
+ "<pr_in_reply_to_review_id>": 128033,
+ "<pr_is_merged>": 128022,
+ "<pr_review>": 128030,
+ "<pr_review_comment>": 128032,
+ "<pr_review_state>": 128031,
+ "<pr_status>": 128021,
+ "<repo_name>": 128007,
+ "<think>": 128040,
+ "<tool_call>": 128042,
+ "<tool_response>": 128048,
+ "<tools>": 128050,
+ "<|IMAGE_PAD|>": 128060,
+ "<|VIDEO_PAD|>": 128061,
+ "<|_placeholder_067|>": 128067,
+ "<|_placeholder_068|>": 128068,
+ "<|_placeholder_069|>": 128069,
+ "<|_placeholder_070|>": 128070,
+ "<|_placeholder_071|>": 128071,
+ "<|_placeholder_072|>": 128072,
+ "<|_placeholder_073|>": 128073,
+ "<|_placeholder_074|>": 128074,
+ "<|_placeholder_075|>": 128075,
+ "<|_placeholder_076|>": 128076,
+ "<|_placeholder_077|>": 128077,
+ "<|_placeholder_078|>": 128078,
+ "<|_placeholder_079|>": 128079,
+ "<|_placeholder_080|>": 128080,
+ "<|_placeholder_081|>": 128081,
+ "<|_placeholder_082|>": 128082,
+ "<|_placeholder_083|>": 128083,
+ "<|_placeholder_084|>": 128084,
+ "<|_placeholder_085|>": 128085,
+ "<|_placeholder_086|>": 128086,
+ "<|_placeholder_087|>": 128087,
+ "<|_placeholder_088|>": 128088,
+ "<|_placeholder_089|>": 128089,
+ "<|_placeholder_090|>": 128090,
+ "<|_placeholder_091|>": 128091,
+ "<|_placeholder_092|>": 128092,
+ "<|_placeholder_093|>": 128093,
+ "<|_placeholder_094|>": 128094,
+ "<|_placeholder_095|>": 128095,
+ "<|_placeholder_096|>": 128096,
+ "<|_placeholder_097|>": 128097,
+ "<|_placeholder_098|>": 128098,
+ "<|_placeholder_099|>": 128099,
+ "<|_placeholder_100|>": 128100,
+ "<|_placeholder_101|>": 128101,
+ "<|_placeholder_102|>": 128102,
+ "<|_placeholder_103|>": 128103,
+ "<|_placeholder_104|>": 128104,
+ "<|_placeholder_105|>": 128105,
+ "<|_placeholder_106|>": 128106,
+ "<|_placeholder_107|>": 128107,
+ "<|_placeholder_108|>": 128108,
+ "<|_placeholder_109|>": 128109,
+ "<|_placeholder_110|>": 128110,
+ "<|_placeholder_111|>": 128111,
+ "<|_placeholder_112|>": 128112,
+ "<|_placeholder_113|>": 128113,
+ "<|_placeholder_114|>": 128114,
+ "<|_placeholder_115|>": 128115,
+ "<|_placeholder_116|>": 128116,
+ "<|_placeholder_117|>": 128117,
+ "<|_placeholder_118|>": 128118,
+ "<|_placeholder_119|>": 128119,
+ "<|_placeholder_120|>": 128120,
+ "<|_placeholder_121|>": 128121,
+ "<|_placeholder_122|>": 128122,
+ "<|_placeholder_123|>": 128123,
+ "<|_placeholder_124|>": 128124,
+ "<|_placeholder_125|>": 128125,
+ "<|_placeholder_126|>": 128126,
+ "<|_placeholder_127|>": 128127,
+ "<|_placeholder_128|>": 128128,
+ "<|_placeholder_129|>": 128129,
+ "<|_placeholder_130|>": 128130,
+ "<|_placeholder_131|>": 128131,
+ "<|_placeholder_132|>": 128132,
+ "<|_placeholder_133|>": 128133,
+ "<|_placeholder_134|>": 128134,
+ "<|_placeholder_135|>": 128135,
+ "<|_placeholder_136|>": 128136,
+ "<|_placeholder_137|>": 128137,
+ "<|_placeholder_138|>": 128138,
+ "<|_placeholder_139|>": 128139,
+ "<|_placeholder_140|>": 128140,
+ "<|_placeholder_141|>": 128141,
+ "<|_placeholder_142|>": 128142,
+ "<|_placeholder_143|>": 128143,
+ "<|_placeholder_144|>": 128144,
+ "<|_placeholder_145|>": 128145,
+ "<|_placeholder_146|>": 128146,
+ "<|_placeholder_147|>": 128147,
+ "<|_placeholder_148|>": 128148,
+ "<|_placeholder_149|>": 128149,
+ "<|_placeholder_150|>": 128150,
+ "<|_placeholder_151|>": 128151,
+ "<|_placeholder_152|>": 128152,
+ "<|_placeholder_153|>": 128153,
+ "<|_placeholder_154|>": 128154,
+ "<|_placeholder_155|>": 128155,
+ "<|_placeholder_156|>": 128156,
+ "<|_placeholder_157|>": 128157,
+ "<|_placeholder_158|>": 128158,
+ "<|_placeholder_159|>": 128159,
+ "<|_placeholder_160|>": 128160,
+ "<|_placeholder_161|>": 128161,
+ "<|_placeholder_162|>": 128162,
+ "<|_placeholder_163|>": 128163,
+ "<|_placeholder_164|>": 128164,
+ "<|_placeholder_165|>": 128165,
+ "<|_placeholder_166|>": 128166,
+ "<|_placeholder_167|>": 128167,
+ "<|_placeholder_168|>": 128168,
+ "<|_placeholder_169|>": 128169,
+ "<|_placeholder_170|>": 128170,
+ "<|_placeholder_171|>": 128171,
+ "<|_placeholder_172|>": 128172,
+ "<|_placeholder_173|>": 128173,
+ "<|_placeholder_174|>": 128174,
+ "<|_placeholder_175|>": 128175,
+ "<|_placeholder_176|>": 128176,
+ "<|_placeholder_177|>": 128177,
+ "<|_placeholder_178|>": 128178,
+ "<|_placeholder_179|>": 128179,
+ "<|_placeholder_180|>": 128180,
+ "<|_placeholder_181|>": 128181,
+ "<|_placeholder_182|>": 128182,
+ "<|_placeholder_183|>": 128183,
+ "<|_placeholder_184|>": 128184,
+ "<|_placeholder_185|>": 128185,
+ "<|_placeholder_186|>": 128186,
+ "<|_placeholder_187|>": 128187,
+ "<|_placeholder_188|>": 128188,
+ "<|_placeholder_189|>": 128189,
+ "<|_placeholder_190|>": 128190,
+ "<|_placeholder_191|>": 128191,
+ "<|_placeholder_192|>": 128192,
+ "<|_placeholder_193|>": 128193,
+ "<|_placeholder_194|>": 128194,
+ "<|_placeholder_195|>": 128195,
+ "<|_placeholder_196|>": 128196,
+ "<|_placeholder_197|>": 128197,
+ "<|_placeholder_198|>": 128198,
+ "<|_placeholder_199|>": 128199,
+ "<|_placeholder_200|>": 128200,
+ "<|_placeholder_201|>": 128201,
+ "<|_placeholder_202|>": 128202,
+ "<|_placeholder_203|>": 128203,
+ "<|_placeholder_204|>": 128204,
+ "<|_placeholder_205|>": 128205,
+ "<|_placeholder_206|>": 128206,
+ "<|_placeholder_207|>": 128207,
+ "<|_placeholder_208|>": 128208,
+ "<|_placeholder_209|>": 128209,
+ "<|_placeholder_210|>": 128210,
+ "<|_placeholder_211|>": 128211,
+ "<|_placeholder_212|>": 128212,
+ "<|_placeholder_213|>": 128213,
+ "<|_placeholder_214|>": 128214,
+ "<|_placeholder_215|>": 128215,
+ "<|_placeholder_216|>": 128216,
+ "<|_placeholder_217|>": 128217,
+ "<|_placeholder_218|>": 128218,
+ "<|_placeholder_219|>": 128219,
+ "<|_placeholder_220|>": 128220,
+ "<|_placeholder_221|>": 128221,
+ "<|_placeholder_222|>": 128222,
+ "<|_placeholder_223|>": 128223,
+ "<|_placeholder_224|>": 128224,
+ "<|_placeholder_225|>": 128225,
+ "<|_placeholder_226|>": 128226,
+ "<|_placeholder_227|>": 128227,
+ "<|_placeholder_228|>": 128228,
+ "<|_placeholder_229|>": 128229,
+ "<|_placeholder_230|>": 128230,
+ "<|_placeholder_231|>": 128231,
+ "<|_placeholder_232|>": 128232,
+ "<|_placeholder_233|>": 128233,
+ "<|_placeholder_234|>": 128234,
+ "<|_placeholder_235|>": 128235,
+ "<|_placeholder_236|>": 128236,
+ "<|_placeholder_237|>": 128237,
+ "<|_placeholder_238|>": 128238,
+ "<|_placeholder_239|>": 128239,
+ "<|_placeholder_240|>": 128240,
+ "<|_placeholder_241|>": 128241,
+ "<|_placeholder_242|>": 128242,
+ "<|_placeholder_243|>": 128243,
+ "<|_placeholder_244|>": 128244,
+ "<|_placeholder_245|>": 128245,
+ "<|_placeholder_246|>": 128246,
+ "<|_placeholder_247|>": 128247,
+ "<|_placeholder_248|>": 128248,
+ "<|_placeholder_249|>": 128249,
+ "<|_placeholder_250|>": 128250,
+ "<|_placeholder_251|>": 128251,
+ "<|_placeholder_252|>": 128252,
+ "<|_placeholder_253|>": 128253,
+ "<|_placeholder_254|>": 128254,
+ "<|_placeholder_255|>": 128255,
+ "<|back_translation|>": 128065,
+ "<|code_switching|>": 128064,
+ "<|document_end|>": 128055,
+ "<|document_start|>": 128054,
+ "<|endofturn|>": 128003,
+ "<|fim_middle|>": 128005,
+ "<|fim_prefix|>": 128004,
+ "<|fim_suffix|>": 128006,
+ "<|im_end|>": 128001,
+ "<|im_start|>": 128000,
+ "<|image_end|>": 128057,
+ "<|image_start|>": 128056,
+ "<|instruction_pretraining|>": 128066,
+ "<|mime_end|>": 128053,
+ "<|mime_start|>": 128052,
+ "<|stop|>": 128002,
+ "<|video_end|>": 128059,
+ "<|video_start|>": 128058,
+ "<|vision_aux_end|>": 128063,
+ "<|vision_aux_start|>": 128062
+ }
chat_template.jinja ADDED
@@ -0,0 +1,137 @@
+ {%- set ns_img = namespace(count=0) %}
+ {%- set ns_vid = namespace(count=0) %}
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' and messages[0].content is string %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- '# Tools\n\n' }}
+ {{- 'You may call one or more functions to assist with the user query.\n\n' }}
+ {{- 'You are provided with function signatures within <tools></tools> XML tags:\n' }}
+ {{- '<tools>\n' }}
+ {%- for tool in tools %}
+ {{- tool | tojson(ensure_ascii=False) }}
+ {%- endfor %}
+ {{- '\n</tools>\n\n' }}
+ {{- 'For each function call, output the function name and arguments within the following XML format:\n' }}
+ {{- '<tool_call>{function-name}\n' }}
+ {{- '<arg_key>{arg-key-1}</arg_key>\n' }}
+ {{- '<arg_value>{arg-value-1}</arg_value>\n' }}
+ {{- '<arg_key>{arg-key-2}</arg_key>\n' }}
+ {{- '<arg_value>{arg-value-2}</arg_value>\n' }}
+ {{- '...\n' }}
+ {{- '</tool_call><|im_end|>\n' }}
+ {%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- set ns = namespace(last_user_index=-1) %}
+ {%- for m in messages %}
+ {%- if m.role == 'user' %}
+ {%- set ns.last_user_index = loop.index0 %}
+ {%- endif %}
+ {%- endfor %}
+ {%- for message in messages %}
+ {%- set content = message.get('content', '') or '' %}
+ {%- if (message.role == 'system' and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>\n' }}
+ {%- elif message.role == 'user' %}
+ {{- '<|im_start|>user\n' }}
+ {%- if message['content'] is string %}
+ {{- message['content'] + '<|im_end|>\n' }}
+ {%- elif message['content'] is sequence %}
+ {%- for content in message['content'] %}
+ {%- if not loop.first %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if content['type'] == 'image_url' %}
+ {%- set media_url = content.get('image_url', {}).get('url', '') %}
+ {%- set url_lower = media_url.lower() %}
+ {%- set video_extensions = ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.flv', '.wmv', '.m4v'] %}
+ {%- set ns_check = namespace(is_video=False) %}
+ {%- for ext in video_extensions %}
+ {%- if url_lower.endswith(ext) %}
+ {%- set ns_check.is_video = True %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if ns_check.is_video %}
+ {%- set video_id = 'video_%02d' % ns_vid.count %}
+ {%- set ns_vid.count = ns_vid.count + 1 %}
+ {%- set filename = media_url.split('/')[-1] %}
+ {{- '<|mime_start|>{"id": "' + video_id + '", "type": "video/mp4", "filename": "' + filename + '"}<|mime_end|>\n' }}
+ {{- '<|video_aux_start|>다음 중 video_duration은 비디오 길이 정보입니다. 참고하여 답변하세요. {"video_duration": "<|video_meta_duration|>"}<|video_aux_end|>\n'}}
+ {{- '<|video_start|><|VIDEO_PAD|><|video_end|>\n'}}
+ {%- else %}
+ {%- set image_id = 'image_%02d' % ns_img.count %}
+ {%- set ns_img.count = ns_img.count + 1 %}
+ {%- set filename = media_url.split('/')[-1] %}
+ {{- '<|mime_start|>{"id": "' + image_id + '", "type": "image/jpeg", "filename": "' + filename + '"}<|mime_end|>\n' }}
+ {{- '<|image_start|><|IMAGE_PAD|><|image_end|>' }}
+ {%- endif %}
+ {%- elif content['type'] == 'text' %}
+ {{- content['text'] }}
+ {%- endif %}
+ {%- endfor %}
+ {{- '<|im_end|>\n'}}
+ {%- endif %}
+ {%- elif message.role == 'assistant' %}
+ {%- set reasoning_content = '' %}
+ {%- if message.get('reasoning_content') is string %}
+ {%- set reasoning_content = message.get('reasoning_content') %}
+ {%- else %}
+ {%- if '</think>' in content %}
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_user_index %}
+ {%- if loop.last or reasoning_content %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.get('tool_calls') %}
+ {%- for tool_call in message.get('tool_calls', []) %}
+ {%- if not loop.first or content %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.get('function') %}
+ {%- set tool_call = tool_call.get('function') %}
+ {%- endif %}
+ {{- '<tool_call>' + tool_call.get('name', '') + '\n' }}
+ {%- set _args = tool_call.get('arguments', {}) %}
+ {%- if _args is string %}
+ {{- '<arguments>' + _args + '</arguments>\n' }}
+ {%- elif _args is mapping %}
+ {%- for k, v in _args.items() %}
+ {{- '<arg_key>' + k + '</arg_key>\n' }}
+ {{- '<arg_value>' + (v | tojson(ensure_ascii=False) if v is not string else v) + '</arg_value>\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == 'tool' %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}
+ {{- '<|im_start|>tool' }}
+ {%- endif %}
+ {{- '\n<tool_response>' + message.get('name', '') + '\n' }}
+ {{- content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {%- if thinking is defined and thinking is true %}
+ {{- '<|im_start|>assistant\n<think>\n' }}
+ {%- else %}
+ {{- '<|im_start|>assistant\n<think>\n\n</think>\n\n' }}
+ {%- endif %}
+ {%- endif %}
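A minimal usage sketch (not part of this commit; the model id is a placeholder): this template is what `tokenizer.apply_chat_template` renders, and recent transformers releases forward extra keyword arguments such as `thinking` into the template context — verify that behavior against your installed version.

from transformers import AutoTokenizer

# Hypothetical path/id standing in for this repository.
tokenizer = AutoTokenizer.from_pretrained("path/to/this-repo", trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]

# add_generation_prompt=True exercises the final branch of the template;
# thinking=True selects the open '<think>' variant instead of the closed, empty one.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    thinking=True,
)
print(prompt)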
config.json ADDED
@@ -0,0 +1,97 @@
+ {
+ "add_cross_attention": false,
+ "architectures": [
+ "HyperCLOVAXForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "attention_multiplier": 0.08838834764831845,
+ "auto_map": {
+ "AutoConfig": "configuration_hyperclovax.HyperCLOVAXConfig",
+ "AutoModel": "modeling_hyperclovax.HyperCLOVAXModel",
+ "AutoModelForCausalLM": "modeling_hyperclovax.HyperCLOVAXForCausalLM"
+ },
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": 128000,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "dtype": "bfloat16",
+ "early_stopping": false,
+ "embedding_multiplier": 1.0,
+ "encoder_no_repeat_ngram_size": 0,
+ "end_token_id": 128001,
+ "eos_token_id": 128001,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 5120,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "initializer_range": 0.006,
+ "intermediate_size": 24192,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "logits_scaling": 1.0,
+ "max_length": 20,
+ "max_position_embeddings": 131072,
+ "min_length": 0,
+ "mlp_bias": false,
+ "model_type": "hyperclovax",
+ "no_repeat_ngram_size": 0,
+ "num_attention_heads": 40,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_hidden_layers": 72,
+ "num_key_value_heads": 8,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": 0,
+ "prefix": null,
+ "pretraining_tp": 1,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "resid_pdrop": 0.2,
+ "residual_multiplier": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 50000000,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": false,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": "bfloat16",
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "use_cache": false,
+ "use_post_norm": false,
+ "vocab_size": 128256,
+ "_name_or_path": "/home/brian/models/HyperCLOVAX-SEED-Text-32B",
+ "transformers_version": "4.52.4"
+ }
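Because `auto_map` above routes the Auto classes to custom code shipped in this repo (configuration_hyperclovax.py / modeling_hyperclovax.py), loading requires `trust_remote_code=True`. A minimal loading sketch, with the model id as a placeholder:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "path/to/this-repo"  # hypothetical local path or hub id
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # matches "dtype": "bfloat16" in the config above
    device_map="auto",
    trust_remote_code=True,      # needed because of the auto_map custom classes
)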
configuration_hyperclovax.py ADDED
@@ -0,0 +1,228 @@
+ # coding=utf-8
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+ # and OPT implementations in this library. It has been modified from its
+ # original forms to accommodate minor architectural differences compared
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """LLaMA model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+
+ # from transformers.modeling_rope_utils import rope_config_validation
+ # from transformers import PretrainedConfig, rope_config_validation
+
+
+ class HyperCLOVAXConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a similar configuration to that of the LLaMA-7B.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`LlamaModel`]
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 11008):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer decoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         num_key_value_heads (`int`, *optional*):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details checkout [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+             `num_attention_heads`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
+             Llama 2 up to 4096, CodeLlama up to 16384.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         pad_token_id (`int`, *optional*):
+             Padding token id.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             Beginning of stream token id.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             End of stream token id.
+         pretraining_tp (`int`, *optional*, defaults to 1):
+             Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+             document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
+             understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
+             results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie weight embeddings
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+             and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+             accordingly.
+             Expected contents:
+                 `rope_type` (`str`):
+                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                     'llama3'], with 'default' being the original RoPE implementation.
+                 `factor` (`float`, *optional*):
+                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                     most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                     original maximum pre-trained length.
+                 `original_max_position_embeddings` (`int`, *optional*):
+                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                     pretraining.
+                 `attention_factor` (`float`, *optional*):
+                     Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                     computation. If unspecified, it defaults to value recommended by the implementation, using the
+                     `factor` field to infer the suggested value.
+                 `beta_fast` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 32.
+                 `beta_slow` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 1.
+                 `short_factor` (`List[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2
+                 `long_factor` (`List[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2
+                 `low_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                 `high_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+         attention_bias (`bool`, *optional*, defaults to `False`):
+             Whether to use a bias in the query, key, value and output projection layers during self-attention.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         mlp_bias (`bool`, *optional*, defaults to `False`):
+             Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
+         head_dim (`int`, *optional*):
+             The attention head dimension. If None, it will default to hidden_size // num_heads
+
+     ```python
+     >>> from transformers import LlamaModel, LlamaConfig
+
+     >>> # Initializing a LLaMA llama-7b style configuration
+     >>> configuration = LlamaConfig()
+
+     >>> # Initializing a model from the llama-7b style configuration
+     >>> model = LlamaModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "hyperclovax"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=32000,
+         hidden_size=4096,
+         intermediate_size=11008,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=None,
+         hidden_act="silu",
+         max_position_embeddings=2048,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         pad_token_id=None,
+         bos_token_id=1,
+         eos_token_id=2,
+         pretraining_tp=1,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         attention_bias=False,
+         attention_dropout=0.0,
+         mlp_bias=False,
+         head_dim=None,
+         embedding_multiplier=1.0,  # mup
+         logits_scaling=1.0,  # mup
+         attention_multiplier=1.0,  # mup
+         residual_multiplier=1.0,  # mup
+         use_post_norm=False,  # post-norm
+         auto_map={
+             "AutoConfig": "configuration_hyperclovax.HyperCLOVAXConfig",
+             "AutoModel": "modeling_hyperclovax.HyperCLOVAXModel",
+             "AutoModelForCausalLM": "modeling_hyperclovax.HyperCLOVAXForCausalLM",
+         },
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.pretraining_tp = pretraining_tp
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.attention_bias = attention_bias
+         self.attention_dropout = attention_dropout
+         self.mlp_bias = mlp_bias
+         self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+         # Validate the correctness of rotary position embeddings parameters
+         # BC: if there is a 'type' field, copy it to 'rope_type'.
+         if self.rope_scaling is not None and "type" in self.rope_scaling:
+             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+         # rope_config_validation(self)
+
+         # mup
+         self.embedding_multiplier = embedding_multiplier
+         self.logits_scaling = logits_scaling
+         self.attention_multiplier = attention_multiplier
+         self.residual_multiplier = residual_multiplier
+
+         # post-norm (dual-norm)
+         self.use_post_norm = use_post_norm
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             auto_map=auto_map,
+             **kwargs,
+         )
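A quick sanity-check sketch (not part of the commit; assumes it is run from the repo snapshot directory so the module imports locally) instantiating the class above with the values from config.json. The mup multipliers default to 1.0 unless overridden, and head_dim falls back to hidden_size // num_attention_heads:

from configuration_hyperclovax import HyperCLOVAXConfig

config = HyperCLOVAXConfig(
    vocab_size=128256,
    hidden_size=5120,
    num_hidden_layers=72,
    num_attention_heads=40,
    num_key_value_heads=8,
    attention_multiplier=0.08838834764831845,  # 1 / sqrt(128), i.e. 1 / sqrt(head_dim)
)
print(config.head_dim)  # 5120 // 40 = 128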
generation_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 128000,
+ "eos_token_id": 128001,
+ "pad_token_id": 0,
+ "transformers_version": "4.52.4",
+ "use_cache": false
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3dcf1b621edecf413c0139db65dfa5cad76257b001a384f29a374310122dcd36
+ size 5233780272
model-00002-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1cd62521eb383eab1b2813e17ef12d1071efe3d8cd7483abf4979d9bff807bac
+ size 5214173656
model-00003-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e47583c64edc9929cddf93d4a77b50962c66d56798864943203ba7053d1b1fb4
+ size 5214173648
model-00004-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:112efea40c23f556021bcc754e872f90f7152cfc7acc7fa959517652921d6870
+ size 5214173656
model-00005-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12c00b1534c97731baee3123b84b84dbaada195c307123c03623e64bdf6270e8
+ size 5214173648
model-00006-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:256420e9850f07f56b04f84889b80c1c8eebccd27d88edfbef7c603b6980d011
+ size 5214173656
model-00007-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a56e0c76182e7a728a543f86ada80fb4464c203940287a0f246720a891f16cb
+ size 5214173648
model-00008-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b662137975294b89c7396194e9e4fcb2590f1979d39c4b73a080af8985233d9
+ size 5214173656
model-00009-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e02b2da36096dc1ca7c1ec69e9697b81fcc756679760c816c1a4538811d0717
+ size 5214173648
model-00010-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85f91c706a802aa5430b705679df89c48acc859ce611def72c33e49de104d449
+ size 5214173648
model-00011-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca83c0d2409d124c35057b10fe05a56096cff81cb629f1153fd9227f88f952e8
+ size 5214173656
model-00012-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52fffd435f4fe5c2d4e6d925f04826462fb72344d2732dfc67734a444901f0b6
+ size 5214173648
model-00013-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:38c6d54bd1ee971ded9d74ea0662de63f05c36df20c6e84ad93d4ebe35babca8
+ size 2607086784
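Each of the thirteen shard entries above is a Git LFS pointer (spec version, SHA-256 of the payload, byte size) rather than the weights themselves; the index that follows maps each tensor name to its shard. A sketch (assumed to run from a fully downloaded snapshot directory) of resolving one tensor through that map:

import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.model.layers.0.self_attn.q_proj.weight"
shard = index["weight_map"][name]  # e.g. "model-00001-of-00013.safetensors"
with safe_open(shard, framework="pt") as f:
    # Read just the shape without materializing the full tensor.
    print(name, tuple(f.get_slice(name).get_shape()))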
model.safetensors.index.json ADDED
@@ -0,0 +1,658 @@
+ {
+ "metadata": {
+ "total_size": 65196697600
+ },
+ "weight_map": {
+ "model.lm_head.weight": "model-00001-of-00013.safetensors",
+ "model.model.embed_tokens.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.0.input_layernorm.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.1.input_layernorm.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.10.input_layernorm.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.10.mlp.down_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.11.input_layernorm.weight": "model-00001-of-00013.safetensors",
+ "model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.12.input_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.12.mlp.down_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.12.mlp.gate_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.12.mlp.up_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.12.self_attn.k_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.12.self_attn.q_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.12.self_attn.v_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.13.input_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.13.mlp.down_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.13.mlp.gate_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.13.mlp.up_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.13.self_attn.k_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.13.self_attn.q_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.13.self_attn.v_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.14.input_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.14.mlp.down_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.14.mlp.gate_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.14.mlp.up_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.14.post_attention_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.14.self_attn.k_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.14.self_attn.o_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.14.self_attn.q_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.15.input_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.15.mlp.down_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.15.mlp.gate_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.15.mlp.up_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.15.post_attention_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.15.self_attn.o_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.16.input_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.16.mlp.down_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.16.mlp.gate_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.16.mlp.up_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.16.post_attention_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.16.self_attn.o_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.17.input_layernorm.weight": "model-00002-of-00013.safetensors",
+ "model.model.layers.17.mlp.down_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.17.mlp.gate_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.17.mlp.up_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.17.post_attention_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.17.self_attn.k_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.17.self_attn.o_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.17.self_attn.q_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.17.self_attn.v_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.18.input_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.18.mlp.down_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.18.mlp.gate_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.18.mlp.up_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.18.post_attention_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.18.self_attn.k_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.18.self_attn.o_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.18.self_attn.q_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.18.self_attn.v_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.19.input_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.19.mlp.down_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.19.mlp.gate_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.19.mlp.up_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.19.post_attention_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.19.self_attn.k_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.19.self_attn.o_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.19.self_attn.q_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.19.self_attn.v_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.2.input_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.2.mlp.down_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.2.mlp.gate_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.2.mlp.up_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.2.post_attention_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.2.self_attn.k_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.2.self_attn.o_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.2.self_attn.q_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.2.self_attn.v_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.20.input_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.20.mlp.down_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.20.mlp.gate_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.20.mlp.up_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.20.post_attention_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.20.self_attn.k_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.20.self_attn.o_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.20.self_attn.q_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.20.self_attn.v_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.21.input_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.21.mlp.down_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.21.mlp.gate_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.21.mlp.up_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.21.post_attention_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.21.self_attn.k_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.21.self_attn.o_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.21.self_attn.q_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.21.self_attn.v_proj.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.22.input_layernorm.weight": "model-00003-of-00013.safetensors",
+ "model.model.layers.22.mlp.down_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.22.mlp.gate_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.22.mlp.up_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.22.post_attention_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.22.self_attn.k_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.22.self_attn.o_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.22.self_attn.q_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.22.self_attn.v_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.23.input_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.23.mlp.down_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.23.mlp.gate_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.23.mlp.up_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.23.post_attention_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.23.self_attn.k_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.23.self_attn.o_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.23.self_attn.q_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.23.self_attn.v_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.24.input_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.24.mlp.down_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.24.mlp.gate_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.24.mlp.up_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.24.post_attention_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.24.self_attn.k_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.24.self_attn.o_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.24.self_attn.q_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.24.self_attn.v_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.25.input_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.25.mlp.down_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.25.mlp.gate_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.25.mlp.up_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.25.post_attention_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.25.self_attn.k_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.25.self_attn.o_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.25.self_attn.q_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.25.self_attn.v_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.26.input_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.26.mlp.down_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.26.mlp.gate_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.26.mlp.up_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.26.post_attention_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.26.self_attn.k_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.26.self_attn.o_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.26.self_attn.q_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.26.self_attn.v_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.27.input_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.27.mlp.down_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.27.mlp.gate_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.27.mlp.up_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.27.post_attention_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.27.self_attn.k_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.27.self_attn.o_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.27.self_attn.q_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.27.self_attn.v_proj.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.28.input_layernorm.weight": "model-00004-of-00013.safetensors",
+ "model.model.layers.28.mlp.down_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.28.mlp.gate_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.28.mlp.up_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.28.post_attention_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.28.self_attn.k_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.28.self_attn.o_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.28.self_attn.q_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.28.self_attn.v_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.29.input_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.29.mlp.down_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.29.mlp.gate_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.29.mlp.up_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.29.post_attention_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.29.self_attn.k_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.29.self_attn.o_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.29.self_attn.q_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.29.self_attn.v_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.3.input_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.3.mlp.down_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.3.mlp.gate_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.3.mlp.up_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.3.post_attention_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.3.self_attn.k_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.3.self_attn.o_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.3.self_attn.q_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.3.self_attn.v_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.30.input_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.30.mlp.down_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.30.mlp.gate_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.30.mlp.up_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.30.post_attention_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.30.self_attn.k_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.30.self_attn.o_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.30.self_attn.q_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.30.self_attn.v_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.31.input_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.31.mlp.down_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.31.mlp.gate_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.31.mlp.up_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.31.post_attention_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.31.self_attn.k_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.31.self_attn.o_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.31.self_attn.q_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.31.self_attn.v_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.32.input_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.32.mlp.down_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.32.mlp.gate_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.32.mlp.up_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.32.post_attention_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.32.self_attn.k_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.32.self_attn.o_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.32.self_attn.q_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.32.self_attn.v_proj.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.33.input_layernorm.weight": "model-00005-of-00013.safetensors",
+ "model.model.layers.33.mlp.down_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.33.mlp.gate_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.33.mlp.up_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.33.post_attention_layernorm.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.33.self_attn.k_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.33.self_attn.o_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.33.self_attn.q_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.33.self_attn.v_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.34.input_layernorm.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.34.mlp.down_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.34.mlp.gate_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.34.mlp.up_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.34.post_attention_layernorm.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.34.self_attn.k_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.34.self_attn.o_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.34.self_attn.q_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.34.self_attn.v_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.35.input_layernorm.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.35.mlp.down_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.35.mlp.gate_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.35.mlp.up_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.35.post_attention_layernorm.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.35.self_attn.k_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.35.self_attn.o_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.35.self_attn.q_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.35.self_attn.v_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.36.input_layernorm.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.36.mlp.down_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.36.mlp.gate_proj.weight": "model-00006-of-00013.safetensors",
+ "model.model.layers.36.mlp.up_proj.weight": "model-00006-of-00013.safetensors",
282
+ "model.model.layers.36.post_attention_layernorm.weight": "model-00006-of-00013.safetensors",
283
+ "model.model.layers.36.self_attn.k_proj.weight": "model-00006-of-00013.safetensors",
284
+ "model.model.layers.36.self_attn.o_proj.weight": "model-00006-of-00013.safetensors",
285
+ "model.model.layers.36.self_attn.q_proj.weight": "model-00006-of-00013.safetensors",
286
+ "model.model.layers.36.self_attn.v_proj.weight": "model-00006-of-00013.safetensors",
287
+ "model.model.layers.37.input_layernorm.weight": "model-00006-of-00013.safetensors",
288
+ "model.model.layers.37.mlp.down_proj.weight": "model-00006-of-00013.safetensors",
289
+ "model.model.layers.37.mlp.gate_proj.weight": "model-00006-of-00013.safetensors",
290
+ "model.model.layers.37.mlp.up_proj.weight": "model-00006-of-00013.safetensors",
291
+ "model.model.layers.37.post_attention_layernorm.weight": "model-00006-of-00013.safetensors",
292
+ "model.model.layers.37.self_attn.k_proj.weight": "model-00006-of-00013.safetensors",
293
+ "model.model.layers.37.self_attn.o_proj.weight": "model-00006-of-00013.safetensors",
294
+ "model.model.layers.37.self_attn.q_proj.weight": "model-00006-of-00013.safetensors",
295
+ "model.model.layers.37.self_attn.v_proj.weight": "model-00006-of-00013.safetensors",
296
+ "model.model.layers.38.input_layernorm.weight": "model-00006-of-00013.safetensors",
297
+ "model.model.layers.38.mlp.down_proj.weight": "model-00006-of-00013.safetensors",
298
+ "model.model.layers.38.mlp.gate_proj.weight": "model-00006-of-00013.safetensors",
299
+ "model.model.layers.38.mlp.up_proj.weight": "model-00006-of-00013.safetensors",
300
+ "model.model.layers.38.post_attention_layernorm.weight": "model-00006-of-00013.safetensors",
301
+ "model.model.layers.38.self_attn.k_proj.weight": "model-00006-of-00013.safetensors",
302
+ "model.model.layers.38.self_attn.o_proj.weight": "model-00006-of-00013.safetensors",
303
+ "model.model.layers.38.self_attn.q_proj.weight": "model-00006-of-00013.safetensors",
304
+ "model.model.layers.38.self_attn.v_proj.weight": "model-00006-of-00013.safetensors",
305
+ "model.model.layers.39.input_layernorm.weight": "model-00006-of-00013.safetensors",
306
+ "model.model.layers.39.mlp.down_proj.weight": "model-00007-of-00013.safetensors",
307
+ "model.model.layers.39.mlp.gate_proj.weight": "model-00007-of-00013.safetensors",
308
+ "model.model.layers.39.mlp.up_proj.weight": "model-00007-of-00013.safetensors",
309
+ "model.model.layers.39.post_attention_layernorm.weight": "model-00007-of-00013.safetensors",
310
+ "model.model.layers.39.self_attn.k_proj.weight": "model-00007-of-00013.safetensors",
311
+ "model.model.layers.39.self_attn.o_proj.weight": "model-00007-of-00013.safetensors",
312
+ "model.model.layers.39.self_attn.q_proj.weight": "model-00007-of-00013.safetensors",
313
+ "model.model.layers.39.self_attn.v_proj.weight": "model-00007-of-00013.safetensors",
314
+ "model.model.layers.4.input_layernorm.weight": "model-00007-of-00013.safetensors",
315
+ "model.model.layers.4.mlp.down_proj.weight": "model-00007-of-00013.safetensors",
316
+ "model.model.layers.4.mlp.gate_proj.weight": "model-00007-of-00013.safetensors",
317
+ "model.model.layers.4.mlp.up_proj.weight": "model-00007-of-00013.safetensors",
318
+ "model.model.layers.4.post_attention_layernorm.weight": "model-00007-of-00013.safetensors",
319
+ "model.model.layers.4.self_attn.k_proj.weight": "model-00007-of-00013.safetensors",
320
+ "model.model.layers.4.self_attn.o_proj.weight": "model-00007-of-00013.safetensors",
321
+ "model.model.layers.4.self_attn.q_proj.weight": "model-00007-of-00013.safetensors",
322
+ "model.model.layers.4.self_attn.v_proj.weight": "model-00007-of-00013.safetensors",
323
+ "model.model.layers.40.input_layernorm.weight": "model-00007-of-00013.safetensors",
324
+ "model.model.layers.40.mlp.down_proj.weight": "model-00007-of-00013.safetensors",
325
+ "model.model.layers.40.mlp.gate_proj.weight": "model-00007-of-00013.safetensors",
326
+ "model.model.layers.40.mlp.up_proj.weight": "model-00007-of-00013.safetensors",
327
+ "model.model.layers.40.post_attention_layernorm.weight": "model-00007-of-00013.safetensors",
328
+ "model.model.layers.40.self_attn.k_proj.weight": "model-00007-of-00013.safetensors",
329
+ "model.model.layers.40.self_attn.o_proj.weight": "model-00007-of-00013.safetensors",
330
+ "model.model.layers.40.self_attn.q_proj.weight": "model-00007-of-00013.safetensors",
331
+ "model.model.layers.40.self_attn.v_proj.weight": "model-00007-of-00013.safetensors",
332
+ "model.model.layers.41.input_layernorm.weight": "model-00007-of-00013.safetensors",
333
+ "model.model.layers.41.mlp.down_proj.weight": "model-00007-of-00013.safetensors",
334
+ "model.model.layers.41.mlp.gate_proj.weight": "model-00007-of-00013.safetensors",
335
+ "model.model.layers.41.mlp.up_proj.weight": "model-00007-of-00013.safetensors",
336
+ "model.model.layers.41.post_attention_layernorm.weight": "model-00007-of-00013.safetensors",
337
+ "model.model.layers.41.self_attn.k_proj.weight": "model-00007-of-00013.safetensors",
338
+ "model.model.layers.41.self_attn.o_proj.weight": "model-00007-of-00013.safetensors",
339
+ "model.model.layers.41.self_attn.q_proj.weight": "model-00007-of-00013.safetensors",
340
+ "model.model.layers.41.self_attn.v_proj.weight": "model-00007-of-00013.safetensors",
341
+ "model.model.layers.42.input_layernorm.weight": "model-00007-of-00013.safetensors",
342
+ "model.model.layers.42.mlp.down_proj.weight": "model-00007-of-00013.safetensors",
343
+ "model.model.layers.42.mlp.gate_proj.weight": "model-00007-of-00013.safetensors",
344
+ "model.model.layers.42.mlp.up_proj.weight": "model-00007-of-00013.safetensors",
345
+ "model.model.layers.42.post_attention_layernorm.weight": "model-00007-of-00013.safetensors",
346
+ "model.model.layers.42.self_attn.k_proj.weight": "model-00007-of-00013.safetensors",
347
+ "model.model.layers.42.self_attn.o_proj.weight": "model-00007-of-00013.safetensors",
348
+ "model.model.layers.42.self_attn.q_proj.weight": "model-00007-of-00013.safetensors",
349
+ "model.model.layers.42.self_attn.v_proj.weight": "model-00007-of-00013.safetensors",
350
+ "model.model.layers.43.input_layernorm.weight": "model-00007-of-00013.safetensors",
351
+ "model.model.layers.43.mlp.down_proj.weight": "model-00007-of-00013.safetensors",
352
+ "model.model.layers.43.mlp.gate_proj.weight": "model-00007-of-00013.safetensors",
353
+ "model.model.layers.43.mlp.up_proj.weight": "model-00007-of-00013.safetensors",
354
+ "model.model.layers.43.post_attention_layernorm.weight": "model-00007-of-00013.safetensors",
355
+ "model.model.layers.43.self_attn.k_proj.weight": "model-00007-of-00013.safetensors",
356
+ "model.model.layers.43.self_attn.o_proj.weight": "model-00007-of-00013.safetensors",
357
+ "model.model.layers.43.self_attn.q_proj.weight": "model-00007-of-00013.safetensors",
358
+ "model.model.layers.43.self_attn.v_proj.weight": "model-00007-of-00013.safetensors",
359
+ "model.model.layers.44.input_layernorm.weight": "model-00007-of-00013.safetensors",
360
+ "model.model.layers.44.mlp.down_proj.weight": "model-00008-of-00013.safetensors",
361
+ "model.model.layers.44.mlp.gate_proj.weight": "model-00008-of-00013.safetensors",
362
+ "model.model.layers.44.mlp.up_proj.weight": "model-00008-of-00013.safetensors",
363
+ "model.model.layers.44.post_attention_layernorm.weight": "model-00008-of-00013.safetensors",
364
+ "model.model.layers.44.self_attn.k_proj.weight": "model-00008-of-00013.safetensors",
365
+ "model.model.layers.44.self_attn.o_proj.weight": "model-00008-of-00013.safetensors",
366
+ "model.model.layers.44.self_attn.q_proj.weight": "model-00008-of-00013.safetensors",
367
+ "model.model.layers.44.self_attn.v_proj.weight": "model-00008-of-00013.safetensors",
368
+ "model.model.layers.45.input_layernorm.weight": "model-00008-of-00013.safetensors",
369
+ "model.model.layers.45.mlp.down_proj.weight": "model-00008-of-00013.safetensors",
370
+ "model.model.layers.45.mlp.gate_proj.weight": "model-00008-of-00013.safetensors",
371
+ "model.model.layers.45.mlp.up_proj.weight": "model-00008-of-00013.safetensors",
372
+ "model.model.layers.45.post_attention_layernorm.weight": "model-00008-of-00013.safetensors",
373
+ "model.model.layers.45.self_attn.k_proj.weight": "model-00008-of-00013.safetensors",
374
+ "model.model.layers.45.self_attn.o_proj.weight": "model-00008-of-00013.safetensors",
375
+ "model.model.layers.45.self_attn.q_proj.weight": "model-00008-of-00013.safetensors",
376
+ "model.model.layers.45.self_attn.v_proj.weight": "model-00008-of-00013.safetensors",
377
+ "model.model.layers.46.input_layernorm.weight": "model-00008-of-00013.safetensors",
378
+ "model.model.layers.46.mlp.down_proj.weight": "model-00008-of-00013.safetensors",
379
+ "model.model.layers.46.mlp.gate_proj.weight": "model-00008-of-00013.safetensors",
380
+ "model.model.layers.46.mlp.up_proj.weight": "model-00008-of-00013.safetensors",
381
+ "model.model.layers.46.post_attention_layernorm.weight": "model-00008-of-00013.safetensors",
382
+ "model.model.layers.46.self_attn.k_proj.weight": "model-00008-of-00013.safetensors",
383
+ "model.model.layers.46.self_attn.o_proj.weight": "model-00008-of-00013.safetensors",
384
+ "model.model.layers.46.self_attn.q_proj.weight": "model-00008-of-00013.safetensors",
385
+ "model.model.layers.46.self_attn.v_proj.weight": "model-00008-of-00013.safetensors",
386
+ "model.model.layers.47.input_layernorm.weight": "model-00008-of-00013.safetensors",
387
+ "model.model.layers.47.mlp.down_proj.weight": "model-00008-of-00013.safetensors",
388
+ "model.model.layers.47.mlp.gate_proj.weight": "model-00008-of-00013.safetensors",
389
+ "model.model.layers.47.mlp.up_proj.weight": "model-00008-of-00013.safetensors",
390
+ "model.model.layers.47.post_attention_layernorm.weight": "model-00008-of-00013.safetensors",
391
+ "model.model.layers.47.self_attn.k_proj.weight": "model-00008-of-00013.safetensors",
392
+ "model.model.layers.47.self_attn.o_proj.weight": "model-00008-of-00013.safetensors",
393
+ "model.model.layers.47.self_attn.q_proj.weight": "model-00008-of-00013.safetensors",
394
+ "model.model.layers.47.self_attn.v_proj.weight": "model-00008-of-00013.safetensors",
395
+ "model.model.layers.48.input_layernorm.weight": "model-00008-of-00013.safetensors",
396
+ "model.model.layers.48.mlp.down_proj.weight": "model-00008-of-00013.safetensors",
397
+ "model.model.layers.48.mlp.gate_proj.weight": "model-00008-of-00013.safetensors",
398
+ "model.model.layers.48.mlp.up_proj.weight": "model-00008-of-00013.safetensors",
399
+ "model.model.layers.48.post_attention_layernorm.weight": "model-00008-of-00013.safetensors",
400
+ "model.model.layers.48.self_attn.k_proj.weight": "model-00008-of-00013.safetensors",
401
+ "model.model.layers.48.self_attn.o_proj.weight": "model-00008-of-00013.safetensors",
402
+ "model.model.layers.48.self_attn.q_proj.weight": "model-00008-of-00013.safetensors",
403
+ "model.model.layers.48.self_attn.v_proj.weight": "model-00008-of-00013.safetensors",
404
+ "model.model.layers.49.input_layernorm.weight": "model-00008-of-00013.safetensors",
405
+ "model.model.layers.49.mlp.down_proj.weight": "model-00008-of-00013.safetensors",
406
+ "model.model.layers.49.mlp.gate_proj.weight": "model-00008-of-00013.safetensors",
407
+ "model.model.layers.49.mlp.up_proj.weight": "model-00008-of-00013.safetensors",
408
+ "model.model.layers.49.post_attention_layernorm.weight": "model-00008-of-00013.safetensors",
409
+ "model.model.layers.49.self_attn.k_proj.weight": "model-00008-of-00013.safetensors",
410
+ "model.model.layers.49.self_attn.o_proj.weight": "model-00008-of-00013.safetensors",
411
+ "model.model.layers.49.self_attn.q_proj.weight": "model-00008-of-00013.safetensors",
412
+ "model.model.layers.49.self_attn.v_proj.weight": "model-00008-of-00013.safetensors",
413
+ "model.model.layers.5.input_layernorm.weight": "model-00008-of-00013.safetensors",
414
+ "model.model.layers.5.mlp.down_proj.weight": "model-00009-of-00013.safetensors",
415
+ "model.model.layers.5.mlp.gate_proj.weight": "model-00009-of-00013.safetensors",
416
+ "model.model.layers.5.mlp.up_proj.weight": "model-00009-of-00013.safetensors",
417
+ "model.model.layers.5.post_attention_layernorm.weight": "model-00009-of-00013.safetensors",
418
+ "model.model.layers.5.self_attn.k_proj.weight": "model-00009-of-00013.safetensors",
419
+ "model.model.layers.5.self_attn.o_proj.weight": "model-00009-of-00013.safetensors",
420
+ "model.model.layers.5.self_attn.q_proj.weight": "model-00009-of-00013.safetensors",
421
+ "model.model.layers.5.self_attn.v_proj.weight": "model-00009-of-00013.safetensors",
422
+ "model.model.layers.50.input_layernorm.weight": "model-00009-of-00013.safetensors",
423
+ "model.model.layers.50.mlp.down_proj.weight": "model-00009-of-00013.safetensors",
424
+ "model.model.layers.50.mlp.gate_proj.weight": "model-00009-of-00013.safetensors",
425
+ "model.model.layers.50.mlp.up_proj.weight": "model-00009-of-00013.safetensors",
426
+ "model.model.layers.50.post_attention_layernorm.weight": "model-00009-of-00013.safetensors",
427
+ "model.model.layers.50.self_attn.k_proj.weight": "model-00009-of-00013.safetensors",
428
+ "model.model.layers.50.self_attn.o_proj.weight": "model-00009-of-00013.safetensors",
429
+ "model.model.layers.50.self_attn.q_proj.weight": "model-00009-of-00013.safetensors",
430
+ "model.model.layers.50.self_attn.v_proj.weight": "model-00009-of-00013.safetensors",
431
+ "model.model.layers.51.input_layernorm.weight": "model-00009-of-00013.safetensors",
432
+ "model.model.layers.51.mlp.down_proj.weight": "model-00009-of-00013.safetensors",
433
+ "model.model.layers.51.mlp.gate_proj.weight": "model-00009-of-00013.safetensors",
434
+ "model.model.layers.51.mlp.up_proj.weight": "model-00009-of-00013.safetensors",
435
+ "model.model.layers.51.post_attention_layernorm.weight": "model-00009-of-00013.safetensors",
436
+ "model.model.layers.51.self_attn.k_proj.weight": "model-00009-of-00013.safetensors",
437
+ "model.model.layers.51.self_attn.o_proj.weight": "model-00009-of-00013.safetensors",
438
+ "model.model.layers.51.self_attn.q_proj.weight": "model-00009-of-00013.safetensors",
439
+ "model.model.layers.51.self_attn.v_proj.weight": "model-00009-of-00013.safetensors",
440
+ "model.model.layers.52.input_layernorm.weight": "model-00009-of-00013.safetensors",
441
+ "model.model.layers.52.mlp.down_proj.weight": "model-00009-of-00013.safetensors",
442
+ "model.model.layers.52.mlp.gate_proj.weight": "model-00009-of-00013.safetensors",
443
+ "model.model.layers.52.mlp.up_proj.weight": "model-00009-of-00013.safetensors",
444
+ "model.model.layers.52.post_attention_layernorm.weight": "model-00009-of-00013.safetensors",
445
+ "model.model.layers.52.self_attn.k_proj.weight": "model-00009-of-00013.safetensors",
446
+ "model.model.layers.52.self_attn.o_proj.weight": "model-00009-of-00013.safetensors",
447
+ "model.model.layers.52.self_attn.q_proj.weight": "model-00009-of-00013.safetensors",
448
+ "model.model.layers.52.self_attn.v_proj.weight": "model-00009-of-00013.safetensors",
449
+ "model.model.layers.53.input_layernorm.weight": "model-00009-of-00013.safetensors",
450
+ "model.model.layers.53.mlp.down_proj.weight": "model-00009-of-00013.safetensors",
451
+ "model.model.layers.53.mlp.gate_proj.weight": "model-00009-of-00013.safetensors",
452
+ "model.model.layers.53.mlp.up_proj.weight": "model-00009-of-00013.safetensors",
453
+ "model.model.layers.53.post_attention_layernorm.weight": "model-00009-of-00013.safetensors",
454
+ "model.model.layers.53.self_attn.k_proj.weight": "model-00009-of-00013.safetensors",
455
+ "model.model.layers.53.self_attn.o_proj.weight": "model-00009-of-00013.safetensors",
456
+ "model.model.layers.53.self_attn.q_proj.weight": "model-00009-of-00013.safetensors",
457
+ "model.model.layers.53.self_attn.v_proj.weight": "model-00009-of-00013.safetensors",
458
+ "model.model.layers.54.input_layernorm.weight": "model-00009-of-00013.safetensors",
459
+ "model.model.layers.54.mlp.down_proj.weight": "model-00009-of-00013.safetensors",
460
+ "model.model.layers.54.mlp.gate_proj.weight": "model-00009-of-00013.safetensors",
461
+ "model.model.layers.54.mlp.up_proj.weight": "model-00009-of-00013.safetensors",
462
+ "model.model.layers.54.post_attention_layernorm.weight": "model-00009-of-00013.safetensors",
463
+ "model.model.layers.54.self_attn.k_proj.weight": "model-00009-of-00013.safetensors",
464
+ "model.model.layers.54.self_attn.o_proj.weight": "model-00009-of-00013.safetensors",
465
+ "model.model.layers.54.self_attn.q_proj.weight": "model-00009-of-00013.safetensors",
466
+ "model.model.layers.54.self_attn.v_proj.weight": "model-00009-of-00013.safetensors",
467
+ "model.model.layers.55.input_layernorm.weight": "model-00009-of-00013.safetensors",
468
+ "model.model.layers.55.mlp.down_proj.weight": "model-00010-of-00013.safetensors",
469
+ "model.model.layers.55.mlp.gate_proj.weight": "model-00010-of-00013.safetensors",
470
+ "model.model.layers.55.mlp.up_proj.weight": "model-00010-of-00013.safetensors",
471
+ "model.model.layers.55.post_attention_layernorm.weight": "model-00010-of-00013.safetensors",
472
+ "model.model.layers.55.self_attn.k_proj.weight": "model-00010-of-00013.safetensors",
473
+ "model.model.layers.55.self_attn.o_proj.weight": "model-00010-of-00013.safetensors",
474
+ "model.model.layers.55.self_attn.q_proj.weight": "model-00010-of-00013.safetensors",
475
+ "model.model.layers.55.self_attn.v_proj.weight": "model-00010-of-00013.safetensors",
476
+ "model.model.layers.56.input_layernorm.weight": "model-00010-of-00013.safetensors",
477
+ "model.model.layers.56.mlp.down_proj.weight": "model-00010-of-00013.safetensors",
478
+ "model.model.layers.56.mlp.gate_proj.weight": "model-00010-of-00013.safetensors",
479
+ "model.model.layers.56.mlp.up_proj.weight": "model-00010-of-00013.safetensors",
480
+ "model.model.layers.56.post_attention_layernorm.weight": "model-00010-of-00013.safetensors",
481
+ "model.model.layers.56.self_attn.k_proj.weight": "model-00010-of-00013.safetensors",
482
+ "model.model.layers.56.self_attn.o_proj.weight": "model-00010-of-00013.safetensors",
483
+ "model.model.layers.56.self_attn.q_proj.weight": "model-00010-of-00013.safetensors",
484
+ "model.model.layers.56.self_attn.v_proj.weight": "model-00010-of-00013.safetensors",
485
+ "model.model.layers.57.input_layernorm.weight": "model-00010-of-00013.safetensors",
486
+ "model.model.layers.57.mlp.down_proj.weight": "model-00010-of-00013.safetensors",
487
+ "model.model.layers.57.mlp.gate_proj.weight": "model-00010-of-00013.safetensors",
488
+ "model.model.layers.57.mlp.up_proj.weight": "model-00010-of-00013.safetensors",
489
+ "model.model.layers.57.post_attention_layernorm.weight": "model-00010-of-00013.safetensors",
490
+ "model.model.layers.57.self_attn.k_proj.weight": "model-00010-of-00013.safetensors",
491
+ "model.model.layers.57.self_attn.o_proj.weight": "model-00010-of-00013.safetensors",
492
+ "model.model.layers.57.self_attn.q_proj.weight": "model-00010-of-00013.safetensors",
493
+ "model.model.layers.57.self_attn.v_proj.weight": "model-00010-of-00013.safetensors",
494
+ "model.model.layers.58.input_layernorm.weight": "model-00010-of-00013.safetensors",
495
+ "model.model.layers.58.mlp.down_proj.weight": "model-00010-of-00013.safetensors",
496
+ "model.model.layers.58.mlp.gate_proj.weight": "model-00010-of-00013.safetensors",
497
+ "model.model.layers.58.mlp.up_proj.weight": "model-00010-of-00013.safetensors",
498
+ "model.model.layers.58.post_attention_layernorm.weight": "model-00010-of-00013.safetensors",
499
+ "model.model.layers.58.self_attn.k_proj.weight": "model-00010-of-00013.safetensors",
500
+ "model.model.layers.58.self_attn.o_proj.weight": "model-00010-of-00013.safetensors",
501
+ "model.model.layers.58.self_attn.q_proj.weight": "model-00010-of-00013.safetensors",
502
+ "model.model.layers.58.self_attn.v_proj.weight": "model-00010-of-00013.safetensors",
503
+ "model.model.layers.59.input_layernorm.weight": "model-00010-of-00013.safetensors",
504
+ "model.model.layers.59.mlp.down_proj.weight": "model-00010-of-00013.safetensors",
505
+ "model.model.layers.59.mlp.gate_proj.weight": "model-00010-of-00013.safetensors",
506
+ "model.model.layers.59.mlp.up_proj.weight": "model-00010-of-00013.safetensors",
507
+ "model.model.layers.59.post_attention_layernorm.weight": "model-00010-of-00013.safetensors",
508
+ "model.model.layers.59.self_attn.k_proj.weight": "model-00010-of-00013.safetensors",
509
+ "model.model.layers.59.self_attn.o_proj.weight": "model-00010-of-00013.safetensors",
510
+ "model.model.layers.59.self_attn.q_proj.weight": "model-00010-of-00013.safetensors",
511
+ "model.model.layers.59.self_attn.v_proj.weight": "model-00010-of-00013.safetensors",
512
+ "model.model.layers.6.input_layernorm.weight": "model-00010-of-00013.safetensors",
513
+ "model.model.layers.6.mlp.down_proj.weight": "model-00010-of-00013.safetensors",
514
+ "model.model.layers.6.mlp.gate_proj.weight": "model-00010-of-00013.safetensors",
515
+ "model.model.layers.6.mlp.up_proj.weight": "model-00010-of-00013.safetensors",
516
+ "model.model.layers.6.post_attention_layernorm.weight": "model-00010-of-00013.safetensors",
517
+ "model.model.layers.6.self_attn.k_proj.weight": "model-00010-of-00013.safetensors",
518
+ "model.model.layers.6.self_attn.o_proj.weight": "model-00010-of-00013.safetensors",
519
+ "model.model.layers.6.self_attn.q_proj.weight": "model-00010-of-00013.safetensors",
520
+ "model.model.layers.6.self_attn.v_proj.weight": "model-00010-of-00013.safetensors",
521
+ "model.model.layers.60.input_layernorm.weight": "model-00010-of-00013.safetensors",
522
+ "model.model.layers.60.mlp.down_proj.weight": "model-00011-of-00013.safetensors",
523
+ "model.model.layers.60.mlp.gate_proj.weight": "model-00011-of-00013.safetensors",
524
+ "model.model.layers.60.mlp.up_proj.weight": "model-00011-of-00013.safetensors",
525
+ "model.model.layers.60.post_attention_layernorm.weight": "model-00011-of-00013.safetensors",
526
+ "model.model.layers.60.self_attn.k_proj.weight": "model-00011-of-00013.safetensors",
527
+ "model.model.layers.60.self_attn.o_proj.weight": "model-00011-of-00013.safetensors",
528
+ "model.model.layers.60.self_attn.q_proj.weight": "model-00011-of-00013.safetensors",
529
+ "model.model.layers.60.self_attn.v_proj.weight": "model-00011-of-00013.safetensors",
530
+ "model.model.layers.61.input_layernorm.weight": "model-00011-of-00013.safetensors",
531
+ "model.model.layers.61.mlp.down_proj.weight": "model-00011-of-00013.safetensors",
532
+ "model.model.layers.61.mlp.gate_proj.weight": "model-00011-of-00013.safetensors",
533
+ "model.model.layers.61.mlp.up_proj.weight": "model-00011-of-00013.safetensors",
534
+ "model.model.layers.61.post_attention_layernorm.weight": "model-00011-of-00013.safetensors",
535
+ "model.model.layers.61.self_attn.k_proj.weight": "model-00011-of-00013.safetensors",
536
+ "model.model.layers.61.self_attn.o_proj.weight": "model-00011-of-00013.safetensors",
537
+ "model.model.layers.61.self_attn.q_proj.weight": "model-00011-of-00013.safetensors",
538
+ "model.model.layers.61.self_attn.v_proj.weight": "model-00011-of-00013.safetensors",
539
+ "model.model.layers.62.input_layernorm.weight": "model-00011-of-00013.safetensors",
540
+ "model.model.layers.62.mlp.down_proj.weight": "model-00011-of-00013.safetensors",
541
+ "model.model.layers.62.mlp.gate_proj.weight": "model-00011-of-00013.safetensors",
542
+ "model.model.layers.62.mlp.up_proj.weight": "model-00011-of-00013.safetensors",
543
+ "model.model.layers.62.post_attention_layernorm.weight": "model-00011-of-00013.safetensors",
544
+ "model.model.layers.62.self_attn.k_proj.weight": "model-00011-of-00013.safetensors",
545
+ "model.model.layers.62.self_attn.o_proj.weight": "model-00011-of-00013.safetensors",
546
+ "model.model.layers.62.self_attn.q_proj.weight": "model-00011-of-00013.safetensors",
547
+ "model.model.layers.62.self_attn.v_proj.weight": "model-00011-of-00013.safetensors",
548
+ "model.model.layers.63.input_layernorm.weight": "model-00011-of-00013.safetensors",
549
+ "model.model.layers.63.mlp.down_proj.weight": "model-00011-of-00013.safetensors",
550
+ "model.model.layers.63.mlp.gate_proj.weight": "model-00011-of-00013.safetensors",
551
+ "model.model.layers.63.mlp.up_proj.weight": "model-00011-of-00013.safetensors",
552
+ "model.model.layers.63.post_attention_layernorm.weight": "model-00011-of-00013.safetensors",
553
+ "model.model.layers.63.self_attn.k_proj.weight": "model-00011-of-00013.safetensors",
554
+ "model.model.layers.63.self_attn.o_proj.weight": "model-00011-of-00013.safetensors",
555
+ "model.model.layers.63.self_attn.q_proj.weight": "model-00011-of-00013.safetensors",
556
+ "model.model.layers.63.self_attn.v_proj.weight": "model-00011-of-00013.safetensors",
557
+ "model.model.layers.64.input_layernorm.weight": "model-00011-of-00013.safetensors",
558
+ "model.model.layers.64.mlp.down_proj.weight": "model-00011-of-00013.safetensors",
559
+ "model.model.layers.64.mlp.gate_proj.weight": "model-00011-of-00013.safetensors",
560
+ "model.model.layers.64.mlp.up_proj.weight": "model-00011-of-00013.safetensors",
561
+ "model.model.layers.64.post_attention_layernorm.weight": "model-00011-of-00013.safetensors",
562
+ "model.model.layers.64.self_attn.k_proj.weight": "model-00011-of-00013.safetensors",
563
+ "model.model.layers.64.self_attn.o_proj.weight": "model-00011-of-00013.safetensors",
564
+ "model.model.layers.64.self_attn.q_proj.weight": "model-00011-of-00013.safetensors",
565
+ "model.model.layers.64.self_attn.v_proj.weight": "model-00011-of-00013.safetensors",
566
+ "model.model.layers.65.input_layernorm.weight": "model-00011-of-00013.safetensors",
567
+ "model.model.layers.65.mlp.down_proj.weight": "model-00011-of-00013.safetensors",
568
+ "model.model.layers.65.mlp.gate_proj.weight": "model-00011-of-00013.safetensors",
569
+ "model.model.layers.65.mlp.up_proj.weight": "model-00011-of-00013.safetensors",
570
+ "model.model.layers.65.post_attention_layernorm.weight": "model-00011-of-00013.safetensors",
571
+ "model.model.layers.65.self_attn.k_proj.weight": "model-00011-of-00013.safetensors",
572
+ "model.model.layers.65.self_attn.o_proj.weight": "model-00011-of-00013.safetensors",
573
+ "model.model.layers.65.self_attn.q_proj.weight": "model-00011-of-00013.safetensors",
574
+ "model.model.layers.65.self_attn.v_proj.weight": "model-00011-of-00013.safetensors",
575
+ "model.model.layers.66.input_layernorm.weight": "model-00011-of-00013.safetensors",
576
+ "model.model.layers.66.mlp.down_proj.weight": "model-00012-of-00013.safetensors",
577
+ "model.model.layers.66.mlp.gate_proj.weight": "model-00012-of-00013.safetensors",
578
+ "model.model.layers.66.mlp.up_proj.weight": "model-00012-of-00013.safetensors",
579
+ "model.model.layers.66.post_attention_layernorm.weight": "model-00012-of-00013.safetensors",
580
+ "model.model.layers.66.self_attn.k_proj.weight": "model-00012-of-00013.safetensors",
581
+ "model.model.layers.66.self_attn.o_proj.weight": "model-00012-of-00013.safetensors",
582
+ "model.model.layers.66.self_attn.q_proj.weight": "model-00012-of-00013.safetensors",
583
+ "model.model.layers.66.self_attn.v_proj.weight": "model-00012-of-00013.safetensors",
584
+ "model.model.layers.67.input_layernorm.weight": "model-00012-of-00013.safetensors",
585
+ "model.model.layers.67.mlp.down_proj.weight": "model-00012-of-00013.safetensors",
586
+ "model.model.layers.67.mlp.gate_proj.weight": "model-00012-of-00013.safetensors",
587
+ "model.model.layers.67.mlp.up_proj.weight": "model-00012-of-00013.safetensors",
588
+ "model.model.layers.67.post_attention_layernorm.weight": "model-00012-of-00013.safetensors",
589
+ "model.model.layers.67.self_attn.k_proj.weight": "model-00012-of-00013.safetensors",
590
+ "model.model.layers.67.self_attn.o_proj.weight": "model-00012-of-00013.safetensors",
591
+ "model.model.layers.67.self_attn.q_proj.weight": "model-00012-of-00013.safetensors",
592
+ "model.model.layers.67.self_attn.v_proj.weight": "model-00012-of-00013.safetensors",
593
+ "model.model.layers.68.input_layernorm.weight": "model-00012-of-00013.safetensors",
594
+ "model.model.layers.68.mlp.down_proj.weight": "model-00012-of-00013.safetensors",
595
+ "model.model.layers.68.mlp.gate_proj.weight": "model-00012-of-00013.safetensors",
596
+ "model.model.layers.68.mlp.up_proj.weight": "model-00012-of-00013.safetensors",
597
+ "model.model.layers.68.post_attention_layernorm.weight": "model-00012-of-00013.safetensors",
598
+ "model.model.layers.68.self_attn.k_proj.weight": "model-00012-of-00013.safetensors",
599
+ "model.model.layers.68.self_attn.o_proj.weight": "model-00012-of-00013.safetensors",
600
+ "model.model.layers.68.self_attn.q_proj.weight": "model-00012-of-00013.safetensors",
601
+ "model.model.layers.68.self_attn.v_proj.weight": "model-00012-of-00013.safetensors",
602
+ "model.model.layers.69.input_layernorm.weight": "model-00012-of-00013.safetensors",
603
+ "model.model.layers.69.mlp.down_proj.weight": "model-00012-of-00013.safetensors",
604
+ "model.model.layers.69.mlp.gate_proj.weight": "model-00012-of-00013.safetensors",
605
+ "model.model.layers.69.mlp.up_proj.weight": "model-00012-of-00013.safetensors",
606
+ "model.model.layers.69.post_attention_layernorm.weight": "model-00012-of-00013.safetensors",
607
+ "model.model.layers.69.self_attn.k_proj.weight": "model-00012-of-00013.safetensors",
608
+ "model.model.layers.69.self_attn.o_proj.weight": "model-00012-of-00013.safetensors",
609
+ "model.model.layers.69.self_attn.q_proj.weight": "model-00012-of-00013.safetensors",
610
+ "model.model.layers.69.self_attn.v_proj.weight": "model-00012-of-00013.safetensors",
611
+ "model.model.layers.7.input_layernorm.weight": "model-00012-of-00013.safetensors",
612
+ "model.model.layers.7.mlp.down_proj.weight": "model-00012-of-00013.safetensors",
613
+ "model.model.layers.7.mlp.gate_proj.weight": "model-00012-of-00013.safetensors",
614
+ "model.model.layers.7.mlp.up_proj.weight": "model-00012-of-00013.safetensors",
615
+ "model.model.layers.7.post_attention_layernorm.weight": "model-00012-of-00013.safetensors",
616
+ "model.model.layers.7.self_attn.k_proj.weight": "model-00012-of-00013.safetensors",
617
+ "model.model.layers.7.self_attn.o_proj.weight": "model-00012-of-00013.safetensors",
618
+ "model.model.layers.7.self_attn.q_proj.weight": "model-00012-of-00013.safetensors",
619
+ "model.model.layers.7.self_attn.v_proj.weight": "model-00012-of-00013.safetensors",
620
+ "model.model.layers.70.input_layernorm.weight": "model-00012-of-00013.safetensors",
621
+ "model.model.layers.70.mlp.down_proj.weight": "model-00012-of-00013.safetensors",
622
+ "model.model.layers.70.mlp.gate_proj.weight": "model-00012-of-00013.safetensors",
623
+ "model.model.layers.70.mlp.up_proj.weight": "model-00012-of-00013.safetensors",
624
+ "model.model.layers.70.post_attention_layernorm.weight": "model-00012-of-00013.safetensors",
625
+ "model.model.layers.70.self_attn.k_proj.weight": "model-00012-of-00013.safetensors",
626
+ "model.model.layers.70.self_attn.o_proj.weight": "model-00012-of-00013.safetensors",
627
+ "model.model.layers.70.self_attn.q_proj.weight": "model-00012-of-00013.safetensors",
628
+ "model.model.layers.70.self_attn.v_proj.weight": "model-00012-of-00013.safetensors",
629
+ "model.model.layers.71.input_layernorm.weight": "model-00012-of-00013.safetensors",
630
+ "model.model.layers.71.mlp.down_proj.weight": "model-00013-of-00013.safetensors",
631
+ "model.model.layers.71.mlp.gate_proj.weight": "model-00013-of-00013.safetensors",
632
+ "model.model.layers.71.mlp.up_proj.weight": "model-00013-of-00013.safetensors",
633
+ "model.model.layers.71.post_attention_layernorm.weight": "model-00013-of-00013.safetensors",
634
+ "model.model.layers.71.self_attn.k_proj.weight": "model-00013-of-00013.safetensors",
635
+ "model.model.layers.71.self_attn.o_proj.weight": "model-00013-of-00013.safetensors",
636
+ "model.model.layers.71.self_attn.q_proj.weight": "model-00013-of-00013.safetensors",
637
+ "model.model.layers.71.self_attn.v_proj.weight": "model-00013-of-00013.safetensors",
638
+ "model.model.layers.8.input_layernorm.weight": "model-00013-of-00013.safetensors",
639
+ "model.model.layers.8.mlp.down_proj.weight": "model-00013-of-00013.safetensors",
640
+ "model.model.layers.8.mlp.gate_proj.weight": "model-00013-of-00013.safetensors",
641
+ "model.model.layers.8.mlp.up_proj.weight": "model-00013-of-00013.safetensors",
642
+ "model.model.layers.8.post_attention_layernorm.weight": "model-00013-of-00013.safetensors",
643
+ "model.model.layers.8.self_attn.k_proj.weight": "model-00013-of-00013.safetensors",
644
+ "model.model.layers.8.self_attn.o_proj.weight": "model-00013-of-00013.safetensors",
645
+ "model.model.layers.8.self_attn.q_proj.weight": "model-00013-of-00013.safetensors",
646
+ "model.model.layers.8.self_attn.v_proj.weight": "model-00013-of-00013.safetensors",
647
+ "model.model.layers.9.input_layernorm.weight": "model-00013-of-00013.safetensors",
648
+ "model.model.layers.9.mlp.down_proj.weight": "model-00013-of-00013.safetensors",
649
+ "model.model.layers.9.mlp.gate_proj.weight": "model-00013-of-00013.safetensors",
650
+ "model.model.layers.9.mlp.up_proj.weight": "model-00013-of-00013.safetensors",
651
+ "model.model.layers.9.post_attention_layernorm.weight": "model-00013-of-00013.safetensors",
652
+ "model.model.layers.9.self_attn.k_proj.weight": "model-00013-of-00013.safetensors",
653
+ "model.model.layers.9.self_attn.o_proj.weight": "model-00013-of-00013.safetensors",
654
+ "model.model.layers.9.self_attn.q_proj.weight": "model-00013-of-00013.safetensors",
655
+ "model.model.layers.9.self_attn.v_proj.weight": "model-00013-of-00013.safetensors",
656
+ "model.model.norm.weight": "model-00013-of-00013.safetensors"
657
+ }
658
+ }
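The weight map above assigns every tensor to one of 13 safetensors shards, and a layer's tensors can straddle a shard boundary (for example, layer 33's input_layernorm sits in shard 5 while the rest of that layer sits in shard 6). As a rough sketch of how such an index is consumed, the snippet below resolves one tensor name to its shard and reads it with the safetensors library; the file names come from the index above, but the snippet itself is illustrative and not part of this repository.

import json
from safetensors import safe_open

# Resolve a single tensor through the sharded index (illustrative sketch).
with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.model.layers.33.mlp.down_proj.weight"
shard = index["weight_map"][name]  # e.g. "model-00006-of-00013.safetensors"
with safe_open(shard, framework="pt") as f:
    tensor = f.get_tensor(name)    # loads only this tensor from the shard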
modeling_hyperclovax.py ADDED
@@ -0,0 +1,1866 @@
+ # coding=utf-8
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+ # and OPT implementations in this library. It has been modified from its
+ # original forms to accommodate minor architectural differences compared
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import math
+ from typing import List, Optional, Tuple, Union
+
+ import torch
+ import torch.nn.functional as F
+ import torch.utils.checkpoint
+ from torch import nn
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+ from transformers.activations import ACT2FN
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
+ from transformers.generation import GenerationMixin
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
+ from transformers.modeling_outputs import (
+     BaseModelOutputWithPast,
+     CausalLMOutputWithPast,
+     QuestionAnsweringModelOutput,
+     SequenceClassifierOutputWithPast,
+     TokenClassifierOutput,
+ )
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
+ from transformers.utils import (
+     add_start_docstrings,
+     add_start_docstrings_to_model_forward,
+     is_flash_attn_greater_or_equal_2_10,
+     is_torchdynamo_compiling,
+     logging,
+     replace_return_docstrings,
+ )
+
+ from .configuration_hyperclovax import HyperCLOVAXConfig
+
+ logger = logging.get_logger(__name__)
+
+ _CONFIG_FOR_DOC = "HyperCLOVAXConfig"
+
+
+ def _prepare_4d_causal_attention_mask_with_cache_position(
+     attention_mask: torch.Tensor,
+     sequence_length: int,
+     target_length: int,
+     dtype: torch.dtype,
+     device: torch.device,
+     min_dtype: float,
+     cache_position: torch.Tensor,
+     batch_size: int,
+ ):
+     """
+     Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+     `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+     Args:
+         attention_mask (`torch.Tensor`):
+             A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+         sequence_length (`int`):
+             The sequence length being processed.
+         target_length (`int`):
+             The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding (the part of the cache that is not filled yet).
+         dtype (`torch.dtype`):
+             The dtype to use for the 4D attention mask.
+         device (`torch.device`):
+             The device to place the 4D attention mask on.
+         min_dtype (`float`):
+             The minimum value representable with the dtype `dtype`.
+         cache_position (`torch.Tensor`):
+             Indices depicting the position of the input sequence tokens in the sequence.
+         batch_size (`int`):
+             Batch size.
+     """
+     if attention_mask is not None and attention_mask.dim() == 4:
+         # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+         causal_mask = attention_mask
+     else:
+         causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+         if sequence_length != 1:
+             causal_mask = torch.triu(causal_mask, diagonal=1)
+         causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+         causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+         if attention_mask is not None:
+             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+             mask_length = attention_mask.shape[-1]
+             padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+             padding_mask = padding_mask == 0
+             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
+
+     return causal_mask
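A minimal standalone sketch of the masking logic above, on a trivially small sequence (the sizes are made up): key positions strictly after a row's cache position are filled with the dtype minimum, and everything attendable stays 0.

import torch

seq_len, target_len = 3, 5
min_dtype = torch.finfo(torch.float32).min
mask = torch.full((seq_len, target_len), fill_value=min_dtype)
mask = torch.triu(mask, diagonal=1)
cache_position = torch.arange(seq_len)
mask *= torch.arange(target_len) > cache_position.reshape(-1, 1)
# Row i now has 0 at key positions <= i (attendable) and min_dtype elsewhere.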
+
+
+ class HyperCLOVAXRMSNorm(nn.Module):
+     def __init__(self, hidden_size, eps=1e-6):
+         """
+         HyperCLOVAXRMSNorm is equivalent to T5LayerNorm
+         """
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(hidden_size))
+         self.variance_epsilon = eps
+
+     def forward(self, hidden_states):
+         input_dtype = hidden_states.dtype
+         hidden_states = hidden_states.to(torch.float32)
+         variance = hidden_states.pow(2).mean(-1, keepdim=True)
+         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+         return self.weight * hidden_states.to(input_dtype)
+
+     def extra_repr(self):
+         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+ ALL_LAYERNORM_LAYERS.append(HyperCLOVAXRMSNorm)
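The forward above normalizes each vector to unit root-mean-square (in float32) before applying the learned per-channel scale. A quick numeric check, not from the original file:

import torch

x = torch.tensor([[3.0, 4.0]])
rms = x.pow(2).mean(-1, keepdim=True).sqrt()
print(x / rms)  # tensor([[0.8485, 1.1314]]) -- unit RMS, before the learned weight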
+
+
+ class HyperCLOVAXRotaryEmbedding(nn.Module):
+     def __init__(
+         self,
+         dim=None,
+         max_position_embeddings=2048,
+         base=10000,
+         device=None,
+         scaling_factor=1.0,
+         rope_type="default",
+         config: Optional[HyperCLOVAXConfig] = None,
+     ):
+         super().__init__()
+         # TODO (joao): remove the `if` below, only used for BC
+         self.rope_kwargs = {}
+         if config is None:
+             logger.warning_once(
+                 "`HyperCLOVAXRotaryEmbedding` can now be fully parameterized by passing the model config through the "
+                 "`config` argument. All other arguments will be removed in v4.46"
+             )
+             self.rope_kwargs = {
+                 "rope_type": rope_type,
+                 "factor": scaling_factor,
+                 "dim": dim,
+                 "base": base,
+                 "max_position_embeddings": max_position_embeddings,
+             }
+             self.rope_type = rope_type
+             self.max_seq_len_cached = max_position_embeddings
+             self.original_max_seq_len = max_position_embeddings
+         else:
+             # BC: "rope_type" was originally "type"
+             if config.rope_scaling is not None:
+                 self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+             else:
+                 self.rope_type = "default"
+             self.max_seq_len_cached = config.max_position_embeddings
+             self.original_max_seq_len = config.max_position_embeddings
+
+         self.config = config
+         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+         self.original_inv_freq = self.inv_freq
+
+     def _dynamic_frequency_update(self, position_ids, device):
+         """
+         dynamic RoPE layers should recompute `inv_freq` in the following situations:
+         1 - growing beyond the cached sequence length (allow scaling)
+         2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+         """
+         seq_len = torch.max(position_ids) + 1
+         if seq_len > self.max_seq_len_cached:  # growth
+             inv_freq, self.attention_scaling = self.rope_init_fn(
+                 self.config, device, seq_len=seq_len, **self.rope_kwargs
+             )
+             self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+             self.max_seq_len_cached = seq_len
+
+         if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+             self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+             self.max_seq_len_cached = self.original_max_seq_len
+
+     @torch.no_grad()
+     def forward(self, x, position_ids):
+         if "dynamic" in self.rope_type:
+             self._dynamic_frequency_update(position_ids, device=x.device)
+
+         # Core RoPE block
+         inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+         position_ids_expanded = position_ids[:, None, :].float()
+         # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+         device_type = x.device.type
+         device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+         with torch.autocast(device_type=device_type, enabled=False):
+             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+             emb = torch.cat((freqs, freqs), dim=-1)
+             cos = emb.cos()
+             sin = emb.sin()
+
+         # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+         cos = cos * self.attention_scaling
+         sin = sin * self.attention_scaling
+
+         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
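For the "default" rope_type, the init function fetched from ROPE_INIT_FUNCTIONS produces the standard inverse-frequency ladder. A hedged sketch of that computation (the exact helper lives in transformers.modeling_rope_utils; the sizes here are made up):

import torch

dim, base = 8, 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))  # shape (dim/2,)
position_ids = torch.arange(6).float()
freqs = torch.outer(position_ids, inv_freq)  # per-position rotation angles, as in the forward above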
+
+
+ class HyperCLOVAXLinearScalingRotaryEmbedding(HyperCLOVAXRotaryEmbedding):
+     """HyperCLOVAXRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+     def __init__(self, *args, **kwargs):
+         logger.warning_once(
+             "`HyperCLOVAXLinearScalingRotaryEmbedding` is deprecated and will be removed in v4.46. Please use "
+             "`HyperCLOVAXRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)."
+         )
+         kwargs["rope_type"] = "linear"
+         super().__init__(*args, **kwargs)
+
+
+ class HyperCLOVAXDynamicNTKScalingRotaryEmbedding(HyperCLOVAXRotaryEmbedding):
+     """HyperCLOVAXRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+     def __init__(self, *args, **kwargs):
+         logger.warning_once(
+             "`HyperCLOVAXDynamicNTKScalingRotaryEmbedding` is deprecated and will be removed in v4.46. Please use "
+             "`HyperCLOVAXRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to "
+             "__init__)."
+         )
+         kwargs["rope_type"] = "dynamic"
+         super().__init__(*args, **kwargs)
+
+
+ def rotate_half(x):
+     """Rotates half the hidden dims of the input."""
+     x1 = x[..., : x.shape[-1] // 2]
+     x2 = x[..., x.shape[-1] // 2 :]
+     return torch.cat((-x2, x1), dim=-1)
+
+
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+     """Applies Rotary Position Embedding to the query and key tensors.
+
+     Args:
+         q (`torch.Tensor`): The query tensor.
+         k (`torch.Tensor`): The key tensor.
+         cos (`torch.Tensor`): The cosine part of the rotary embedding.
+         sin (`torch.Tensor`): The sine part of the rotary embedding.
+         position_ids (`torch.Tensor`, *optional*):
+             Deprecated and unused.
+         unsqueeze_dim (`int`, *optional*, defaults to 1):
+             The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+             sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+             that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+             k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+             cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+             the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+     Returns:
+         `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
+     """
+     cos = cos.unsqueeze(unsqueeze_dim)
+     sin = sin.unsqueeze(unsqueeze_dim)
+     q_embed = (q * cos) + (rotate_half(q) * sin)
+     k_embed = (k * cos) + (rotate_half(k) * sin)
+     return q_embed, k_embed
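Because each (cos, sin) pair acts as a 2D rotation on paired channels, applying the embedding leaves per-vector norms unchanged. A small self-contained check (redefining rotate_half locally so the snippet runs on its own; the shapes are made up):

import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

q = torch.randn(1, 2, 4, 8)   # (batch, heads, seq, head_dim)
angles = torch.rand(1, 4, 4)  # (batch, seq, head_dim/2)
emb = torch.cat((angles, angles), dim=-1)
cos, sin = emb.cos().unsqueeze(1), emb.sin().unsqueeze(1)
q_rot = q * cos + rotate_half(q) * sin
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5))  # True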
+
+
+ class HyperCLOVAXMLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.hidden_size = config.hidden_size
+         self.intermediate_size = config.intermediate_size
+         self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+         self.act_fn = ACT2FN[config.hidden_act]
+
+     def forward(self, x):
+         if self.config.pretraining_tp > 1:
+             slice = self.intermediate_size // self.config.pretraining_tp
+             gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
+             up_proj_slices = self.up_proj.weight.split(slice, dim=0)
+             down_proj_slices = self.down_proj.weight.split(slice, dim=1)
+
+             gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
+             up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
+
+             intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
+             down_proj = [
+                 F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
+             ]
+             down_proj = sum(down_proj)
+         else:
+             down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+         return down_proj
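The else branch is the standard gated MLP: down_proj(act_fn(gate_proj(x)) * up_proj(x)). A shape-level sketch with made-up sizes, assuming a SiLU activation (the actual activation is whatever config.hidden_act selects):

import torch
import torch.nn as nn
import torch.nn.functional as F

hidden, inter = 16, 44
gate = nn.Linear(hidden, inter, bias=False)
up = nn.Linear(hidden, inter, bias=False)
down = nn.Linear(inter, hidden, bias=False)
x = torch.randn(2, 5, hidden)
y = down(F.silu(gate(x)) * up(x))  # mirrors the non-tensor-parallel path above
print(y.shape)  # torch.Size([2, 5, 16])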
+
+
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+     """
+     This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+     num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+     """
+     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+     if n_rep == 1:
+         return hidden_states
+     hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
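A shape check for repeat_kv, using the function defined just above (the head counts here are made up, not this model's configuration):

import torch

kv = torch.randn(2, 4, 10, 64)  # (batch, num_key_value_heads, seq_len, head_dim)
out = repeat_kv(kv, n_rep=8)    # 4 KV heads expanded to serve 32 query heads
print(out.shape)                # torch.Size([2, 32, 10, 64])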
+
322
+
323
+ class HyperCLOVAXAttention(nn.Module):
+     """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+     def __init__(self, config: HyperCLOVAXConfig, layer_idx: Optional[int] = None):
+         super().__init__()
+         self.config = config
+         self.layer_idx = layer_idx
+         if layer_idx is None:
+             logger.warning_once(
+                 f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                 "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                 "when creating this class."
+             )
+
+         self.attention_dropout = config.attention_dropout
+         self.hidden_size = config.hidden_size
+         self.num_heads = config.num_attention_heads
+         self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+         self.num_key_value_heads = config.num_key_value_heads
+         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+         self.max_position_embeddings = config.max_position_embeddings
+         self.rope_theta = config.rope_theta
+         self.is_causal = True
+
+         self.scaling = config.attention_multiplier
+
+         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+         self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+
+         # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
+         self.rotary_emb = HyperCLOVAXRotaryEmbedding(config=self.config)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value: Optional[Cache] = None,
+         output_attentions: bool = False,
+         use_cache: bool = False,
+         cache_position: Optional[torch.LongTensor] = None,
+         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+         **kwargs,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+         bsz, q_len, _ = hidden_states.size()
+
+         if self.config.pretraining_tp > 1:
+             key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+             query_slices = self.q_proj.weight.split(
+                 (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+             )
+             key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+             value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+             query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+             query_states = torch.cat(query_states, dim=-1)
+
+             key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+             key_states = torch.cat(key_states, dim=-1)
+
+             value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+             value_states = torch.cat(value_states, dim=-1)
+
+         else:
+             query_states = self.q_proj(hidden_states)
+             key_states = self.k_proj(hidden_states)
+             value_states = self.v_proj(hidden_states)
+
+         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+         if position_embeddings is None:
+             logger.warning_once(
+                 "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                 "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                 "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                 "removed and `position_embeddings` will be mandatory."
+             )
+             cos, sin = self.rotary_emb(value_states, position_ids)
+         else:
+             cos, sin = position_embeddings
+         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+         if past_key_value is not None:
+             # sin and cos are specific to RoPE models; cache_position needed for the static cache
+             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+         key_states = repeat_kv(key_states, self.num_key_value_groups)
+         value_states = repeat_kv(value_states, self.num_key_value_groups)
+         # attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling / math.sqrt(self.head_dim)
+         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
+
+         if attention_mask is not None:  # no matter the length, we just slice it
+             causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+             attn_weights = attn_weights + causal_mask
+
+         # upcast attention to fp32
+         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+         attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+         attn_output = torch.matmul(attn_weights, value_states)
+
+         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+             raise ValueError(
+                 f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                 f" {attn_output.size()}"
+             )
+
+         attn_output = attn_output.transpose(1, 2).contiguous()
+
+         attn_output = attn_output.reshape(bsz, q_len, -1)
+
+         if self.config.pretraining_tp > 1:
+             attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+             o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+             attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+         else:
+             attn_output = self.o_proj(attn_output)
+
+         if not output_attentions:
+             attn_weights = None
+
+         return attn_output, attn_weights, past_key_value
+
+
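+ # --- Editor's note (illustrative sketch, hypothetical helper name): in the
+ # eager path above, the usual 1/sqrt(head_dim) softmax scale is replaced by the
+ # muP-style `config.attention_multiplier` (see the commented-out line in
+ # `forward`). A bare scaled-dot-product step with an explicit scale, on toy
+ # tensors, looks like this:
+ def _mup_scale_demo(scale: float = 0.125):
+     q, k, v = (torch.randn(1, 2, 4, 8) for _ in range(3))
+     attn = torch.softmax(torch.matmul(q, k.transpose(2, 3)) * scale, dim=-1)
+     return torch.matmul(attn, v)  # (1, 2, 4, 8), same layout as q/v
+
+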
451
+ class HyperCLOVAXFlashAttention2(HyperCLOVAXAttention):
+     """
+     HyperCLOVAX flash attention module. This module inherits from `HyperCLOVAXAttention` as the weights of the module stay
+     untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
+     flash attention and deal with padding tokens in case the input contains any of them.
+     """
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+         # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle the difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.LongTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value: Optional[Cache] = None,
+         output_attentions: bool = False,
+         use_cache: bool = False,
+         cache_position: Optional[torch.LongTensor] = None,
+         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+         if isinstance(past_key_value, StaticCache):
+             raise ValueError(
+                 "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2`; "
+                 "make sure to use `sdpa` in the meantime, and open an issue at https://github.com/huggingface/transformers"
+             )
+
+         output_attentions = False
+
+         bsz, q_len, _ = hidden_states.size()
+
+         query_states = self.q_proj(hidden_states)
+         key_states = self.k_proj(hidden_states)
+         value_states = self.v_proj(hidden_states)
+
+         # Flash attention requires the input to have the shape
+         # batch_size x seq_length x num_heads x head_dim;
+         # we transpose to (batch, heads, seq, head_dim) only for RoPE/cache and transpose back below
+         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+         if position_embeddings is None:
+             logger.warning_once(
+                 "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                 "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                 "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                 "removed and `position_embeddings` will be mandatory."
+             )
+             cos, sin = self.rotary_emb(value_states, position_ids)
+         else:
+             cos, sin = position_embeddings
+         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+         if past_key_value is not None:
+             # sin and cos are specific to RoPE models; cache_position needed for the static cache
+             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+         # TODO: These transposes are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+         # to be able to avoid many of these transpose/reshape/view.
+         query_states = query_states.transpose(1, 2)
+         key_states = key_states.transpose(1, 2)
+         value_states = value_states.transpose(1, 2)
+
+         dropout_rate = self.attention_dropout if self.training else 0.0
+
+         # In PEFT, we usually cast the layer norms to float32 for training stability,
+         # so the input hidden states may get silently cast to float32. Hence, we
+         # cast them back to the correct dtype just to be sure everything works as expected.
+         # This might slow down training & inference, so it is recommended not to cast the LayerNorms
+         # to fp32. (HyperCLOVAXRMSNorm handles it correctly)
+
+         input_dtype = query_states.dtype
+         if input_dtype == torch.float32:
+             if torch.is_autocast_enabled():
+                 target_dtype = torch.get_autocast_gpu_dtype()
+             # Handle the case where the model is quantized
+             elif hasattr(self.config, "_pre_quantization_dtype"):
+                 target_dtype = self.config._pre_quantization_dtype
+             else:
+                 target_dtype = self.q_proj.weight.dtype
+
+             logger.warning_once(
+                 f"The input hidden states seem to be silently cast to float32; this might be related to"
+                 f" the fact that you have upcast embedding or layer norm layers to float32. We will cast back the input to"
+                 f" {target_dtype}."
+             )
+
+             query_states = query_states.to(target_dtype)
+             key_states = key_states.to(target_dtype)
+             value_states = value_states.to(target_dtype)
+
+         attn_output = _flash_attention_forward(
+             query_states,
+             key_states,
+             value_states,
+             attention_mask,
+             q_len,
+             position_ids=position_ids,
+             dropout=dropout_rate,
+             softmax_scale=self.scaling,  # mup
+             sliding_window=getattr(self, "sliding_window", None),
+             use_top_left_mask=self._flash_attn_uses_top_left_mask,
+             is_causal=self.is_causal,
+         )
+
+         attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+         attn_output = self.o_proj(attn_output)
+
+         if not output_attentions:
+             attn_weights = None
+
+         return attn_output, attn_weights, past_key_value
+
+
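+ # --- Editor's illustrative sketch (assumed example, hypothetical helper name):
+ # the alignment issue noted in `__init__` above. With q_len < k_len (decoding
+ # with a cache), a bottom-right-aligned causal mask lets query i attend to keys
+ # [0, k_len - q_len + i]; a top-left-aligned one wrongly stops at key i.
+ def _causal_alignment_demo(q_len: int = 2, k_len: int = 5):
+     ones = torch.ones(q_len, k_len, dtype=torch.bool)
+     bottom_right = torch.tril(ones, diagonal=k_len - q_len)  # flash_attn>=2.1 default
+     top_left = torch.tril(ones)                              # flash_attn<2.1 behavior
+     return bottom_right, top_left
+
+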
572
+ class HyperCLOVAXSdpaAttention(HyperCLOVAXAttention):
+     """
+     HyperCLOVAX attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+     `HyperCLOVAXAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
+     the SDPA API.
+     """
+
+     # Adapted from HyperCLOVAXAttention.forward
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value: Optional[Cache] = None,
+         output_attentions: bool = False,
+         use_cache: bool = False,
+         cache_position: Optional[torch.LongTensor] = None,
+         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+         **kwargs,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+         if output_attentions:
+             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+             logger.warning_once(
+                 "HyperCLOVAXModel is using HyperCLOVAXSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+             )
+             return super().forward(
+                 hidden_states=hidden_states,
+                 attention_mask=attention_mask,
+                 position_ids=position_ids,
+                 past_key_value=past_key_value,
+                 output_attentions=output_attentions,
+                 use_cache=use_cache,
+                 cache_position=cache_position,
+                 position_embeddings=position_embeddings,
+             )
+
+         bsz, q_len, _ = hidden_states.size()
+
+         query_states = self.q_proj(hidden_states)
+         key_states = self.k_proj(hidden_states)
+         value_states = self.v_proj(hidden_states)
+
+         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+         if position_embeddings is None:
+             logger.warning_once(
+                 "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                 "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                 "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                 "removed and `position_embeddings` will be mandatory."
+             )
+             cos, sin = self.rotary_emb(value_states, position_ids)
+         else:
+             cos, sin = position_embeddings
+         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+         if past_key_value is not None:
+             # sin and cos are specific to RoPE models; cache_position needed for the static cache
+             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+         key_states = repeat_kv(key_states, self.num_key_value_groups)
+         value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+         causal_mask = attention_mask
+         if attention_mask is not None:
+             causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask.
+         # Reference: https://github.com/pytorch/pytorch/issues/112577.
+         if query_states.device.type == "cuda" and causal_mask is not None:
+             query_states = query_states.contiguous()
+             key_states = key_states.contiguous()
+             value_states = value_states.contiguous()
+
+         # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+         # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+         is_causal = True if causal_mask is None and q_len > 1 else False
+
+         attn_output = torch.nn.functional.scaled_dot_product_attention(
+             query_states,
+             key_states,
+             value_states,
+             attn_mask=causal_mask,
+             dropout_p=self.attention_dropout if self.training else 0.0,
+             is_causal=is_causal,
+             scale=self.scaling,  # mup
+         )
+
+         attn_output = attn_output.transpose(1, 2).contiguous()
+         attn_output = attn_output.view(bsz, q_len, -1)
+
+         attn_output = self.o_proj(attn_output)
+
+         return attn_output, None, past_key_value
+
+
+ HyperCLOVAX_ATTENTION_CLASSES = {
+     "eager": HyperCLOVAXAttention,
+     "flash_attention_2": HyperCLOVAXFlashAttention2,
+     "sdpa": HyperCLOVAXSdpaAttention,
+ }
+
+
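+ # --- Editor's note (usage sketch, assumed example): the dispatch table above is
+ # indexed with `config._attn_implementation`, which `from_pretrained` derives
+ # from its `attn_implementation` argument, e.g.:
+ #
+ #     model = HyperCLOVAXForCausalLM.from_pretrained(
+ #         YOUR_DIR, attn_implementation="sdpa"  # or "eager" / "flash_attention_2"
+ #     )
+
+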
679
+ class HyperCLOVAXDecoderLayer(nn.Module):
+     def __init__(self, config: HyperCLOVAXConfig, layer_idx: int):
+         super().__init__()
+         self.hidden_size = config.hidden_size
+
+         self.self_attn = HyperCLOVAX_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+
+         self.mlp = HyperCLOVAXMLP(config)
+         self.input_layernorm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_attention_layernorm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+         # post-norm (dual-norm)
+         self.use_post_norm = config.use_post_norm
+         if self.use_post_norm:
+             self.post_norm1 = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+             self.post_norm2 = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+         self.residual_multiplier = config.residual_multiplier  # mup
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value: Optional[Cache] = None,
+         output_attentions: Optional[bool] = False,
+         use_cache: Optional[bool] = False,
+         cache_position: Optional[torch.LongTensor] = None,
+         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+         **kwargs,
+     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+         """
+         Args:
+             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+             attention_mask (`torch.FloatTensor`, *optional*):
+                 attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                 query_sequence_length, key_sequence_length)` if default attention is used.
+             output_attentions (`bool`, *optional*):
+                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                 returned tensors for more detail.
+             use_cache (`bool`, *optional*):
+                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                 (see `past_key_values`).
+             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+             cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                 Indices depicting the position of the input sequence tokens in the sequence
+             position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                 Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                 with `head_dim` being the embedding dimension of each attention head.
+             kwargs (`dict`, *optional*):
+                 Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
+                 into the model
+         """
+         residual = hidden_states
+
+         hidden_states = self.input_layernorm(hidden_states)
+
+         # Self Attention
+         hidden_states, self_attn_weights, present_key_value = self.self_attn(
+             hidden_states=hidden_states,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_value=past_key_value,
+             output_attentions=output_attentions,
+             use_cache=use_cache,
+             cache_position=cache_position,
+             position_embeddings=position_embeddings,
+             **kwargs,
+         )
+
+         if self.use_post_norm:
+             hidden_states = self.post_norm1(hidden_states)
+
+         hidden_states = residual + hidden_states * self.residual_multiplier  # mup
+
+         # Fully Connected
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states = self.mlp(hidden_states)
+
+         if self.use_post_norm:
+             hidden_states = self.post_norm2(hidden_states)
+
+         hidden_states = residual + hidden_states * self.residual_multiplier  # mup
+
+         outputs = (hidden_states,)
+
+         if output_attentions:
+             outputs += (self_attn_weights,)
+
+         if use_cache:
+             outputs += (present_key_value,)
+
+         return outputs
+
+
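+ # --- Editor's note (illustrative sketch): the residual flow of the layer above,
+ # including the optional dual-norm and the muP residual multiplier:
+ #
+ #     h = x + post_norm1(attn(input_layernorm(x))) * residual_multiplier
+ #     y = h + post_norm2(mlp(post_attention_layernorm(h))) * residual_multiplier
+ #
+ # `post_norm1`/`post_norm2` only exist (and apply) when `config.use_post_norm`
+ # is True; otherwise this reduces to a standard pre-norm residual layer.
+
+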
775
+ HyperCLOVAX_START_DOCSTRING = r"""
+     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+     library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+     etc.)
+
+     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+     and behavior.
+
+     Parameters:
+         config ([`HyperCLOVAXConfig`]):
+             Model configuration class with all the parameters of the model. Initializing with a config file does not
+             load the weights associated with the model, only the configuration. Check out the
+             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ """
+
+
+ @add_start_docstrings(
+     "The bare HyperCLOVAX Model outputting raw hidden-states without any specific head on top.",
+     HyperCLOVAX_START_DOCSTRING,
+ )
+ class HyperCLOVAXPreTrainedModel(PreTrainedModel):
+     config_class = HyperCLOVAXConfig
+     base_model_prefix = "model"
+     supports_gradient_checkpointing = True
+     _no_split_modules = ["HyperCLOVAXDecoderLayer"]
+     _skip_keys_device_placement = ["past_key_values"]
+     _supports_flash_attn_2 = True
+     _supports_sdpa = True
+     _supports_cache_class = True
+     _supports_quantized_cache = True
+     _supports_static_cache = True
+
+     def _init_weights(self, module):
+         std = self.config.initializer_range
+         if isinstance(module, nn.Linear):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.padding_idx is not None:
+                 module.weight.data[module.padding_idx].zero_()
+
+
820
+ HyperCLOVAX_INPUTS_DOCSTRING = r"""
+     Args:
+         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+             it.
+
+             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+             [`PreTrainedTokenizer.__call__`] for details.
+
+             [What are input IDs?](../glossary#input-ids)
+         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+             - 1 for tokens that are **not masked**,
+             - 0 for tokens that are **masked**.
+
+             [What are attention masks?](../glossary#attention-mask)
+
+             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+             [`PreTrainedTokenizer.__call__`] for details.
+
+             If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+             `past_key_values`).
+
+             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+             information on the default strategy.
+         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
+             config.n_positions - 1]`.
+
+             [What are position IDs?](../glossary#position-ids)
+         past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+             Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+             blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
+             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+             Two formats are allowed:
+             - a [`~cache_utils.Cache`] instance, see our
+               [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
+             - a tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+               shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`. This is also known as the legacy
+               cache format.
+
+             The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+             legacy cache format will be returned.
+
+             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+             have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+             of shape `(batch_size, sequence_length)`.
+         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+             model's internal embedding lookup matrix.
+         use_cache (`bool`, *optional*):
+             If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+             `past_key_values`).
+         output_attentions (`bool`, *optional*):
+             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+             tensors for more detail.
+         output_hidden_states (`bool`, *optional*):
+             Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+             more detail.
+         return_dict (`bool`, *optional*):
+             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+         cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+             Indices depicting the position of the input sequence tokens in the sequence. Unlike `position_ids`,
+             this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+             the complete sequence length.
+ """
+
+
895
+ @add_start_docstrings(
+     "The bare HyperCLOVAX Model outputting raw hidden-states without any specific head on top.",
+     HyperCLOVAX_START_DOCSTRING,
+ )
+ class HyperCLOVAXModel(HyperCLOVAXPreTrainedModel):
+     """
+     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`HyperCLOVAXDecoderLayer`]
+
+     Args:
+         config: HyperCLOVAXConfig
+     """
+
+     def __init__(self, config: HyperCLOVAXConfig):
+         super().__init__(config)
+         self.padding_idx = config.pad_token_id
+         self.vocab_size = config.vocab_size
+
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+         self.layers = nn.ModuleList(
+             [HyperCLOVAXDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+         )
+         self.norm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.rotary_emb = HyperCLOVAXRotaryEmbedding(config=config)
+         self.gradient_checkpointing = False
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+         # mup
+         self.embedding_multiplier = config.embedding_multiplier
+
+     def get_input_embeddings(self):
+         return self.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.embed_tokens = value
+
+     @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+     ) -> Union[Tuple, BaseModelOutputWithPast]:
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         use_cache = use_cache if use_cache is not None else self.config.use_cache
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if (input_ids is None) ^ (inputs_embeds is not None):
+             raise ValueError(
+                 "You cannot specify both input_ids and inputs_embeds at the same time; you must specify exactly one of the two"
+             )
+
+         if self.gradient_checkpointing and self.training and use_cache:
+             logger.warning_once(
+                 "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+             )
+             use_cache = False
+
+         if inputs_embeds is None:
+             inputs_embeds = self.embed_tokens(input_ids)
+
+         inputs_embeds = inputs_embeds * self.embedding_multiplier  # mup
+
+         # kept for BC (non `Cache` `past_key_values` inputs)
+         return_legacy_cache = False
+         if use_cache and not isinstance(past_key_values, Cache):
+             return_legacy_cache = True
+             if past_key_values is None:
+                 past_key_values = DynamicCache()
+             else:
+                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                 logger.warning_once(
+                     "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                     "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                     "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                 )
+
+         if cache_position is None:
+             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+             cache_position = torch.arange(
+                 past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+             )
+         if position_ids is None:
+             position_ids = cache_position.unsqueeze(0)
+
+         causal_mask = self._update_causal_mask(
+             attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+         )
+         hidden_states = inputs_embeds
+
+         # create position embeddings to be shared across the decoder layers
+         position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+         # decoder layers
+         all_hidden_states = () if output_hidden_states else None
+         all_self_attns = () if output_attentions else None
+         next_decoder_cache = None
+
+         for decoder_layer in self.layers:
+             if output_hidden_states:
+                 all_hidden_states += (hidden_states,)
+
+             if self.gradient_checkpointing and self.training:
+                 layer_outputs = self._gradient_checkpointing_func(
+                     decoder_layer.__call__,
+                     hidden_states,
+                     causal_mask,
+                     position_ids,
+                     past_key_values,
+                     output_attentions,
+                     use_cache,
+                     cache_position,
+                     position_embeddings,
+                 )
+             else:
+                 layer_outputs = decoder_layer(
+                     hidden_states,
+                     attention_mask=causal_mask,
+                     position_ids=position_ids,
+                     past_key_value=past_key_values,
+                     output_attentions=output_attentions,
+                     use_cache=use_cache,
+                     cache_position=cache_position,
+                     position_embeddings=position_embeddings,
+                 )
+
+             hidden_states = layer_outputs[0]
+
+             if use_cache:
+                 next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+             if output_attentions:
+                 all_self_attns += (layer_outputs[1],)
+
+         hidden_states = self.norm(hidden_states)
+
+         # add hidden states from the last decoder layer
+         if output_hidden_states:
+             all_hidden_states += (hidden_states,)
+
+         next_cache = next_decoder_cache if use_cache else None
+         if return_legacy_cache:
+             next_cache = next_cache.to_legacy_cache()
+
+         if not return_dict:
+             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+         return BaseModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             past_key_values=next_cache,
+             hidden_states=all_hidden_states,
+             attentions=all_self_attns,
+         )
+
1059
+     def _update_causal_mask(
+         self,
+         attention_mask: torch.Tensor,
+         input_tensor: torch.Tensor,
+         cache_position: torch.Tensor,
+         past_key_values: Cache,
+         output_attentions: bool,
+     ):
+         if self.config._attn_implementation == "flash_attention_2":
+             if attention_mask is not None and 0.0 in attention_mask:
+                 return attention_mask
+             return None
+
+         # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+         # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+         # to infer the attention mask.
+         past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+         using_static_cache = isinstance(past_key_values, StaticCache)
+
+         # When output_attentions is True, the sdpa implementation's forward method calls the eager implementation's forward
+         if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+             if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                 attention_mask,
+                 inputs_embeds=input_tensor,
+                 past_key_values_length=past_seen_tokens,
+                 is_training=self.training,
+             ):
+                 return None
+
+         dtype, device = input_tensor.dtype, input_tensor.device
+         min_dtype = torch.finfo(dtype).min
+         sequence_length = input_tensor.shape[1]
+         if using_static_cache:
+             target_length = past_key_values.get_max_length()
+         else:
+             target_length = (
+                 attention_mask.shape[-1]
+                 if isinstance(attention_mask, torch.Tensor)
+                 else past_seen_tokens + sequence_length + 1
+             )
+
+         # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
+         causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+             attention_mask,
+             sequence_length=sequence_length,
+             target_length=target_length,
+             dtype=dtype,
+             device=device,
+             min_dtype=min_dtype,
+             cache_position=cache_position,
+             batch_size=input_tensor.shape[0],
+         )
+
+         if (
+             self.config._attn_implementation == "sdpa"
+             and attention_mask is not None
+             and attention_mask.device.type == "cuda"
+             and not output_attentions
+         ):
+             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+             # using left padding. This is required by F.scaled_dot_product_attention's memory-efficient attention path.
+             # Details: https://github.com/pytorch/pytorch/issues/110213
+             causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+         return causal_mask
+
+
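+ # --- Editor's illustrative sketch (assumed example, hypothetical helper name):
+ # how a 2D padding mask becomes the additive 4D float mask that
+ # `_update_causal_mask` returns, with `min_dtype` at disallowed positions.
+ def _causal_mask_demo():
+     pad = torch.tensor([[1, 1, 1, 0]])  # (batch, seq); 0 marks padding
+     seq = pad.shape[1]
+     min_dtype = torch.finfo(torch.float32).min
+     causal = torch.triu(torch.full((seq, seq), min_dtype), diagonal=1)  # block future positions
+     mask = causal[None, None, :, :].masked_fill(pad[:, None, None, :] == 0, min_dtype)
+     return mask  # (1, 1, seq, seq); added to the attention logits
+
+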
1126
+ class HyperCLOVAXForCausalLM(HyperCLOVAXPreTrainedModel, GenerationMixin):
+     _tied_weights_keys = ["lm_head.weight"]
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.model = HyperCLOVAXModel(config)
+         self.vocab_size = config.vocab_size
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def _get_apply_liger_kernel_converter(self):
+         return _apply_liger_kernel_to_instance
+
+     def get_input_embeddings(self):
+         return self.model.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.model.embed_tokens = value
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         self.lm_head = new_embeddings
+
+     def set_decoder(self, decoder):
+         self.model = decoder
+
+     def get_decoder(self):
+         return self.model
+
+     @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
+     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         num_logits_to_keep: int = 0,
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+         r"""
+         Args:
+             labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                 Labels for computing the language modeling loss. Indices should either be in `[0, ...,
+                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                 (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+             num_logits_to_keep (`int`, *optional*):
+                 Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+                 `input_ids` (special case). Only the last token's logits are needed for generation, and calculating them
+                 only for that token can save memory, which becomes quite significant for long sequences or large vocabulary sizes.
+
+         Returns:
+
+         Example:
+
+         ```python
+         >>> from transformers import AutoTokenizer, HyperCLOVAXForCausalLM
+
+         >>> model = HyperCLOVAXForCausalLM.from_pretrained(YOUR_DIR)
+         >>> tokenizer = AutoTokenizer.from_pretrained(YOUR_DIR)
+
+         >>> prompt = "Hey, are you conscious? Can you talk to me?"
+         >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+         >>> # Generate
+         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+         "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+         ```"""
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+         outputs = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             cache_position=cache_position,
+         )
+
+         hidden_states = outputs[0]
+         if self.config.pretraining_tp > 1:
+             lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+             logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+             logits = torch.cat(logits, dim=-1)
+         else:
+             if labels is None and not is_torchdynamo_compiling():
+                 logger.warning_once(
+                     "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
+                 )
+             # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+             # TODO: remove the float() operation in v4.46
+             logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+
+         logits = logits * self.config.logits_scaling  # mup
+
+         loss = None
+         if labels is not None:
+             # Upcast to float if we need to compute the loss to avoid potential precision issues
+             logits = logits.float()
+             # Shift so that tokens < n predict n
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             # Flatten the tokens
+             loss_fct = CrossEntropyLoss()
+             shift_logits = shift_logits.view(-1, self.config.vocab_size)
+             shift_labels = shift_labels.view(-1)
+             # Enable model parallelism
+             shift_labels = shift_labels.to(shift_logits.device)
+             loss = loss_fct(shift_logits, shift_labels)
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return (loss,) + output if loss is not None else output
+
+         return CausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
1269
+     def prepare_inputs_for_generation(
+         self,
+         input_ids,
+         past_key_values=None,
+         attention_mask=None,
+         inputs_embeds=None,
+         cache_position=None,
+         position_ids=None,
+         use_cache=True,
+         num_logits_to_keep=None,
+         **kwargs,
+     ):
+         # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+         # Exception 1: when passing input_embeds, input_ids may be missing entries
+         # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+         if past_key_values is not None:
+             if inputs_embeds is not None:  # Exception 1
+                 input_ids = input_ids[:, -cache_position.shape[0] :]
+             elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no-op, is Exception 2)
+                 input_ids = input_ids[:, cache_position]
+
+         if attention_mask is not None and position_ids is None:
+             # create position_ids on the fly for batch generation
+             position_ids = attention_mask.long().cumsum(-1) - 1
+             position_ids.masked_fill_(attention_mask == 0, 1)
+             if past_key_values:
+                 position_ids = position_ids[:, -input_ids.shape[1] :]
+
+                 # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead"`, as otherwise the input `position_ids` would have varying strides during decoding. Here, simply using `.contiguous()` is not sufficient, as in the batch size = 1 case `position_ids` is already contiguous, but with a varying stride, which retriggers a capture.
+                 position_ids = position_ids.clone(memory_format=torch.contiguous_format)
+
+         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+         if inputs_embeds is not None and cache_position[0] == 0:
+             model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
+         else:
+             # The clone here is for the same reason as for `position_ids`.
+             model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
+
+         if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+             if model_inputs["inputs_embeds"] is not None:
+                 batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                 device = model_inputs["inputs_embeds"].device
+             else:
+                 batch_size, sequence_length = model_inputs["input_ids"].shape
+                 device = model_inputs["input_ids"].device
+
+             dtype = self.lm_head.weight.dtype
+             min_dtype = torch.finfo(dtype).min
+
+             attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+                 attention_mask,
+                 sequence_length=sequence_length,
+                 target_length=past_key_values.get_max_length(),
+                 dtype=dtype,
+                 device=device,
+                 min_dtype=min_dtype,
+                 cache_position=cache_position,
+                 batch_size=batch_size,
+             )
+
+         if num_logits_to_keep is not None:
+             model_inputs["num_logits_to_keep"] = num_logits_to_keep
+
+         model_inputs.update(
+             {
+                 "position_ids": position_ids,
+                 "cache_position": cache_position,
+                 "past_key_values": past_key_values,
+                 "use_cache": use_cache,
+                 "attention_mask": attention_mask,
+             }
+         )
+         return model_inputs
+
+
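+ # --- Editor's illustrative sketch (assumed example, hypothetical helper name):
+ # the next-token shift used in `HyperCLOVAXForCausalLM.forward` -- logits at
+ # position t are scored against the label at position t + 1.
+ def _shift_loss_demo(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+     shift_logits = logits[..., :-1, :].contiguous()  # drop the last position
+     shift_labels = labels[..., 1:].contiguous()      # drop the first token
+     return CrossEntropyLoss()(
+         shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
+     )
+
+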
1344
+ @add_start_docstrings(
+     """
+     The HyperCLOVAX Model transformer with a sequence classification head on top (linear layer).
+
+     [`HyperCLOVAXForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+     (e.g. GPT-2) do.
+
+     Since it does classification on the last token, it needs to know the position of the last token. If a
+     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (takes the last value in
+     each row of the batch).
+     """,
+     HyperCLOVAX_START_DOCSTRING,
+ )
+ class HyperCLOVAXForSequenceClassification(HyperCLOVAXPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.model = HyperCLOVAXModel(config)
+         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.model.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.model.embed_tokens = value
+
+     @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         transformer_outputs = self.model(
+             input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         hidden_states = transformer_outputs[0]
+         logits = self.score(hidden_states)
+
+         if input_ids is not None:
+             batch_size = input_ids.shape[0]
+         else:
+             batch_size = inputs_embeds.shape[0]
+
+         if self.config.pad_token_id is None and batch_size != 1:
+             raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+         if self.config.pad_token_id is None:
+             sequence_lengths = -1
+         else:
+             if input_ids is not None:
+                 # if no pad token is found, use modulo instead of reverse indexing for ONNX compatibility
+                 sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                 sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                 sequence_lengths = sequence_lengths.to(logits.device)
+             else:
+                 sequence_lengths = -1
+
+         pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+         loss = None
+         if labels is not None:
+             labels = labels.to(logits.device)
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 if self.num_labels == 1:
+                     loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                 else:
+                     loss = loss_fct(pooled_logits, labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(pooled_logits, labels)
+         if not return_dict:
+             output = (pooled_logits,) + transformer_outputs[1:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutputWithPast(
+             loss=loss,
+             logits=pooled_logits,
+             past_key_values=transformer_outputs.past_key_values,
+             hidden_states=transformer_outputs.hidden_states,
+             attentions=transformer_outputs.attentions,
+         )
+
+
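+ # --- Editor's illustrative sketch (assumed example, hypothetical helper name):
+ # the ONNX-friendly modulo trick used above to find the last non-padding token.
+ def _last_token_demo():
+     pad_id = 0
+     ids = torch.tensor([[5, 6, 7, pad_id], [8, 9, pad_id, pad_id], [1, 2, 3, 4]])
+     last = torch.eq(ids, pad_id).int().argmax(-1) - 1  # index before the first pad
+     last = last % ids.shape[-1]                        # rows without padding wrap from -1 to seq-1
+     assert last.tolist() == [2, 1, 3]
+
+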
1467
+ @add_start_docstrings(
+     """
+     The HyperCLOVAX Model transformer with a span classification head on top for extractive question-answering tasks like
+     SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+     """,
+     HyperCLOVAX_START_DOCSTRING,
+ )
+ class HyperCLOVAXForQuestionAnswering(HyperCLOVAXPreTrainedModel):
+     base_model_prefix = "transformer"
+
+     # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->HyperCLOVAX
+     def __init__(self, config):
+         super().__init__(config)
+         self.transformer = HyperCLOVAXModel(config)
+         self.qa_outputs = nn.Linear(config.hidden_size, 2)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.transformer.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.transformer.embed_tokens = value
+
+     @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         start_positions: Optional[torch.LongTensor] = None,
+         end_positions: Optional[torch.LongTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, QuestionAnsweringModelOutput]:
+         r"""
+         start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for the position (index) of the start of the labelled span for computing the token classification loss.
+             Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+             are not taken into account for computing the loss.
+         end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for the position (index) of the end of the labelled span for computing the token classification loss.
+             Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+             are not taken into account for computing the loss.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.transformer(
+             input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         sequence_output = outputs[0]
+
+         logits = self.qa_outputs(sequence_output)
+         start_logits, end_logits = logits.split(1, dim=-1)
+         start_logits = start_logits.squeeze(-1).contiguous()
+         end_logits = end_logits.squeeze(-1).contiguous()
+
+         total_loss = None
+         if start_positions is not None and end_positions is not None:
+             # If we are on multi-GPU, the positions may carry an extra dimension; squeeze it
+             if len(start_positions.size()) > 1:
+                 start_positions = start_positions.squeeze(-1).to(start_logits.device)
+             if len(end_positions.size()) > 1:
+                 end_positions = end_positions.squeeze(-1).to(end_logits.device)
+             # sometimes the start/end positions are outside our model inputs; we ignore these terms
+             ignored_index = start_logits.size(1)
+             start_positions = start_positions.clamp(0, ignored_index)
+             end_positions = end_positions.clamp(0, ignored_index)
+
+             loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+             start_loss = loss_fct(start_logits, start_positions)
+             end_loss = loss_fct(end_logits, end_positions)
+             total_loss = (start_loss + end_loss) / 2
+
+         if not return_dict:
+             output = (start_logits, end_logits) + outputs[2:]
+             return ((total_loss,) + output) if total_loss is not None else output
+
+         return QuestionAnsweringModelOutput(
+             loss=total_loss,
+             start_logits=start_logits,
+             end_logits=end_logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
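+ # --- Editor's illustrative sketch (assumed example, hypothetical helper name):
+ # the span loss above averages two cross-entropies (start and end positions),
+ # with out-of-range targets clamped to an ignored index.
+ def _span_loss_demo(start_logits, end_logits, start_positions, end_positions):
+     ignored_index = start_logits.size(1)
+     loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+     start_loss = loss_fct(start_logits, start_positions.clamp(0, ignored_index))
+     end_loss = loss_fct(end_logits, end_positions.clamp(0, ignored_index))
+     return (start_loss + end_loss) / 2
+
+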
1566
+ @add_start_docstrings(
+     """
+     The HyperCLOVAX Model transformer with a token classification head on top (a linear layer on top of the hidden-states
+     output) e.g. for Named-Entity-Recognition (NER) tasks.
+     """,
+     HyperCLOVAX_START_DOCSTRING,
+ )
+ class HyperCLOVAXForTokenClassification(HyperCLOVAXPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.model = HyperCLOVAXModel(config)
+         if getattr(config, "classifier_dropout", None) is not None:
+             classifier_dropout = config.classifier_dropout
+         elif getattr(config, "hidden_dropout", None) is not None:
+             classifier_dropout = config.hidden_dropout
+         else:
+             classifier_dropout = 0.1
+         self.dropout = nn.Dropout(classifier_dropout)
+         self.score = nn.Linear(config.hidden_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.model.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.model.embed_tokens = value
+
+     @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, TokenClassifierOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the token classification loss. Indices should be in `[0, ...,
+             config.num_labels - 1]`.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.model(
+             input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         sequence_output = outputs[0]
+         sequence_output = self.dropout(sequence_output)
+         logits = self.score(sequence_output)
+
+         loss = None
+         if labels is not None:
+             loss_fct = CrossEntropyLoss()
+             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return TokenClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
1650
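A minimal sketch (toy shapes are assumptions) of the per-token loss above: logits are flattened to `(batch * seq_len, num_labels)` and scored with plain cross-entropy; positions labelled `-100` are skipped by `CrossEntropyLoss`'s default `ignore_index`.

import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, num_labels = 2, 6, 5
logits = torch.randn(batch_size, seq_len, num_labels)   # what self.score(...) produces
labels = torch.randint(0, num_labels, (batch_size, seq_len))
labels[0, -2:] = -100                                   # mask padding positions

loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))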
+ ################################################################################################
+ ################################################################################################
+ """
+ liger kernel monkey patching
+ https://github.com/linkedin/Liger-Kernel/blob/v0.5.2/src/liger_kernel/transformers/monkey_patch.py
+ """
+
+ import inspect
+ import logging
+ from functools import partial
+ from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union
+
+ import torch
+ import torch.nn.functional as F
+ import transformers
+ from packaging import version
+ from torch.nn import CrossEntropyLoss
+ from transformers import PreTrainedModel
+
+ if TYPE_CHECKING:
+     from transformers.cache_utils import Cache
+
+ import sys
+
+ from packaging.version import parse
+
+ if sys.version_info < (3, 8):
+     import importlib_metadata
+ else:
+     import importlib.metadata as importlib_metadata
+
+ try:
+     from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
+     from liger_kernel.transformers.functional import liger_cross_entropy
+     from liger_kernel.transformers.fused_linear_cross_entropy import (
+         LigerFusedLinearCrossEntropyLoss,
+     )
+     from liger_kernel.transformers.rms_norm import LigerRMSNorm
+     from liger_kernel.transformers.rope import liger_rotary_pos_emb
+     from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
+
+     _is_liger_kernel_available = True
+
+     LIGER_KERNEL_MATCHING_VERSION = parse("0.5.2")
+     liger_kernel_version = parse(importlib_metadata.version("liger_kernel"))
+     _is_liger_kernel_version_matching = (
+         liger_kernel_version.major,
+         liger_kernel_version.minor,
+         liger_kernel_version.release[-1],
+     ) == (
+         LIGER_KERNEL_MATCHING_VERSION.major,
+         LIGER_KERNEL_MATCHING_VERSION.minor,
+         LIGER_KERNEL_MATCHING_VERSION.release[-1],
+     )
+ except Exception:
+     _is_liger_kernel_available = False
+     _is_liger_kernel_version_matching = False
+
+
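A standalone illustration of the pinning check above, with an assumed installed version: the patch only declares itself version-matching when major, minor, and micro all equal 0.5.2.

from packaging.version import parse

pinned = parse("0.5.2")
installed = parse("0.5.3")  # assumption: pretend this is the installed liger_kernel
matching = (installed.major, installed.minor, installed.release[-1]) == (
    pinned.major, pinned.minor, pinned.release[-1]
)
print(matching)  # False -> the monkey patch stays disabled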
+ def lce_forward_deprecated(
+     self,
+     input_ids: torch.LongTensor = None,
+     attention_mask: Optional[torch.Tensor] = None,
+     position_ids: Optional[torch.LongTensor] = None,
+     past_key_values: Optional[Union["Cache", List[torch.FloatTensor]]] = None,
+     inputs_embeds: Optional[torch.FloatTensor] = None,
+     labels: Optional[torch.LongTensor] = None,
+     use_cache: Optional[bool] = None,
+     output_attentions: Optional[bool] = None,
+     output_hidden_states: Optional[bool] = None,
+     return_dict: Optional[bool] = None,
+     cache_position: Optional[torch.LongTensor] = None,
+     num_logits_to_keep: int = 0,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+     output_hidden_states = (
+         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+     )
+     return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+     # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+     outputs = self.model(
+         input_ids=input_ids,
+         attention_mask=attention_mask,
+         position_ids=position_ids,
+         past_key_values=past_key_values,
+         inputs_embeds=inputs_embeds,
+         use_cache=use_cache,
+         output_attentions=output_attentions,
+         output_hidden_states=output_hidden_states,
+         return_dict=return_dict,
+         cache_position=cache_position,
+     )
+     hidden_states = outputs[0]
+
+     loss = None
+     logits = None
+
+     if self.training and (labels is not None):
+         if num_logits_to_keep != 0:
+             hidden_states = hidden_states[:, -num_logits_to_keep:, :]  # NOTE: slicing here before the label shift is not fully verified
+         hidden_states = hidden_states * self.config.logits_scaling  # muP
+
+         shift_hidden_states = hidden_states[..., :-1, :].contiguous()
+         shift_labels = labels[..., 1:].contiguous()
+
+         # flatten tokens
+         shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
+         shift_labels = shift_labels.view(-1)
+
+         lce = LigerFusedLinearCrossEntropyLoss()
+         loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
+
+     else:
+         assert self.config.pretraining_tp == 1, "not supported"
+         logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+         logits = logits * self.config.logits_scaling  # muP
+
+         if labels is not None:
+             # Upcast to float if we need to compute the loss to avoid potential precision issues
+             logits = logits.float()
+             # Shift so that tokens < n predict n
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             # Flatten the tokens
+             loss_fct = CrossEntropyLoss()
+             shift_logits = shift_logits.view(-1, self.config.vocab_size)
+             shift_labels = shift_labels.view(-1)
+             # Enable model parallelism
+             shift_labels = shift_labels.to(shift_logits.device)
+             loss = loss_fct(shift_logits, shift_labels)
+
+     if not return_dict:
+         output = (logits,) + outputs[1:]
+         return (loss,) + output if loss is not None else output
+
+     return CausalLMOutputWithPast(
+         loss=loss,
+         logits=logits,
+         past_key_values=outputs.past_key_values,
+         hidden_states=outputs.hidden_states,
+         attentions=outputs.attentions,
+     )
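For reference, a plain-torch sketch of what the fused branch above computes (toy sizes are assumptions): `LigerFusedLinearCrossEntropyLoss` returns the same cross-entropy as projecting the hidden states through the `lm_head` weight first, but without ever materializing the `(num_tokens, vocab_size)` logits tensor.

import torch
from torch.nn import CrossEntropyLoss

num_tokens, hidden_size, vocab_size = 16, 32, 100
shift_hidden_states = torch.randn(num_tokens, hidden_size)
lm_head_weight = torch.randn(vocab_size, hidden_size)
shift_labels = torch.randint(0, vocab_size, (num_tokens,))

logits = shift_hidden_states @ lm_head_weight.T  # the tensor the fused kernel avoids building
loss = CrossEntropyLoss()(logits, shift_labels)  # numerically what the fused loss returns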
+
+
+ def _bind_method_to_module(module, method_name: str, new_method: Callable):
+     # Binds a new method to a module instance so that self is passed as the first argument
+     module.__dict__[method_name] = new_method.__get__(module, module.__class__)
+
+
+ def _patch_rms_norm_module(module, offset=0.0, eps=1e-6, casting_mode="llama", in_place=True):
+     module.offset = offset
+     module.casting_mode = casting_mode
+     module.variance_epsilon = getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
+     module.in_place = in_place
+     _bind_method_to_module(module, "forward", LigerRMSNorm.forward)
+     _bind_method_to_module(module, "extra_repr", LigerRMSNorm.extra_repr)
+
+
+ def apply_liger_kernel_to_hyperclovax(
+     rope: bool = True,
+     cross_entropy: bool = False,
+     fused_linear_cross_entropy: bool = True,
+     rms_norm: bool = True,
+     swiglu: bool = True,
+     model: PreTrainedModel = None,
+ ) -> None:
+     # rebind the module-level symbols (plain assignment would only create function locals)
+     global apply_rotary_pos_emb, HyperCLOVAXRMSNorm, HyperCLOVAXMLP
+
+     assert not cross_entropy, "not supported"
+     if rope:
+         apply_rotary_pos_emb = liger_rotary_pos_emb
+     if rms_norm:
+         HyperCLOVAXRMSNorm = LigerRMSNorm
+     if swiglu:
+         HyperCLOVAXMLP = LigerSwiGLUMLP
+     # to use VLM forward in VLM repo
+     # if fused_linear_cross_entropy:
+     #     HyperCLOVAXForCausalLM.forward = lce_forward_deprecated
+
+     if model is not None:
+         # The model instance already exists, so we need to additionally patch the
+         # instance variables that reference already-instantiated modules (e.g. HyperCLOVAXRMSNorm or HyperCLOVAXMLP)
+
+         # get the base model from the model instance
+         base_model: HyperCLOVAXModel = getattr(model, model.base_model_prefix, model)
+
+         if rms_norm:
+             _patch_rms_norm_module(base_model.norm)
+
+         for decoder_layer in base_model.layers:
+             if swiglu:
+                 _bind_method_to_module(decoder_layer.mlp, "forward", LigerSwiGLUMLP.forward)
+             if rms_norm:
+                 _patch_rms_norm_module(decoder_layer.input_layernorm)
+                 _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+                 if decoder_layer.use_post_norm:
+                     _patch_rms_norm_module(decoder_layer.post_norm1)
+                     _patch_rms_norm_module(decoder_layer.post_norm2)
+
+
+ def _apply_liger_kernel_to_instance(model: PreTrainedModel, **kwargs) -> None:
+     model_type = getattr(model, "config", None) and getattr(model.config, "model_type", None)
+     assert model_type == "hyperclovax"
+     apply_fn = apply_liger_kernel_to_hyperclovax
+     apply_fn_signature = inspect.signature(apply_fn)
+
+     # Filter out the keyword arguments that are not supported by the apply function
+     applicable_kwargs = {key: value for key, value in kwargs.items() if key in apply_fn_signature.parameters}
+     logger.info(
+         f"Applying Liger kernels to model instance with model type: {model_type} with kwargs: {applicable_kwargs}"
+     )
+     apply_fn(model=model, **applicable_kwargs)
+
+
+ ################################################################################################
+ ################################################################################################
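A hedged usage sketch (not part of the shipped files): applying the Liger patches to an already-instantiated model. The checkpoint path is hypothetical; the availability flags come from the guarded import above.

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("path/to/hyperclovax-checkpoint", trust_remote_code=True)  # hypothetical path
if _is_liger_kernel_available and _is_liger_kernel_version_matching:
    # swap in the Liger RoPE/RMSNorm/SwiGLU kernels on the live instance
    _apply_liger_kernel_to_instance(model, rope=True, rms_norm=True, swiglu=True)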
special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "image_token": "<|IMAGE_PAD|>",
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "video_token": "<|VIDEO_PAD|>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,2079 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128000": {"content": "<|im_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128001": {"content": "<|im_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128002": {"content": "<|stop|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128003": {"content": "<|endofturn|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128004": {"content": "<|fim_prefix|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128005": {"content": "<|fim_middle|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128006": {"content": "<|fim_suffix|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128007": {"content": "<repo_name>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128008": {"content": "<file_sep>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128009": {"content": "<issue_start>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128010": {"content": "<issue_comment>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128011": {"content": "<issue_closed>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128012": {"content": "<jupyter_start>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128013": {"content": "<jupyter_text>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128014": {"content": "<jupyter_code>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128015": {"content": "<jupyter_output>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128016": {"content": "<jupyter_script>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128017": {"content": "<empty_output>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128018": {"content": "<code_to_intermediate>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128019": {"content": "<intermediate_to_code>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128020": {"content": "<pr>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128021": {"content": "<pr_status>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128022": {"content": "<pr_is_merged>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128023": {"content": "<pr_base>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128024": {"content": "<pr_file>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128025": {"content": "<pr_base_code>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128026": {"content": "<pr_diff>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128027": {"content": "<pr_diff_hunk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128028": {"content": "<pr_comment>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128029": {"content": "<pr_event_id>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128030": {"content": "<pr_review>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128031": {"content": "<pr_review_state>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128032": {"content": "<pr_review_comment>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128033": {"content": "<pr_in_reply_to_review_id>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128034": {"content": "<pr_in_reply_to_comment_id>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128035": {"content": "<pr_diff_hunk_comment_line>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128036": {"content": "<NAME>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128037": {"content": "<EMAIL>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128038": {"content": "<KEY>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128039": {"content": "<PASSWORD>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128040": {"content": "<think>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128041": {"content": "</think>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128042": {"content": "<tool_call>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128043": {"content": "</tool_call>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128044": {"content": "<arg_key>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128045": {"content": "</arg_key>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128046": {"content": "<arg_value>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128047": {"content": "</arg_value>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128048": {"content": "<tool_response>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128049": {"content": "</tool_response>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128050": {"content": "<tools>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128051": {"content": "</tools>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "128052": {"content": "<|mime_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128053": {"content": "<|mime_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128054": {"content": "<|document_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128055": {"content": "<|document_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128056": {"content": "<|image_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128057": {"content": "<|image_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128058": {"content": "<|video_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128059": {"content": "<|video_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128060": {"content": "<|IMAGE_PAD|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128061": {"content": "<|VIDEO_PAD|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128062": {"content": "<|vision_aux_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128063": {"content": "<|vision_aux_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128064": {"content": "<|code_switching|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128065": {"content": "<|back_translation|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128066": {"content": "<|instruction_pretraining|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128067": {"content": "<|_placeholder_067|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128068": {"content": "<|_placeholder_068|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128069": {"content": "<|_placeholder_069|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128070": {"content": "<|_placeholder_070|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128071": {"content": "<|_placeholder_071|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128072": {"content": "<|_placeholder_072|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128073": {"content": "<|_placeholder_073|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128074": {"content": "<|_placeholder_074|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128075": {"content": "<|_placeholder_075|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128076": {"content": "<|_placeholder_076|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128077": {"content": "<|_placeholder_077|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128078": {"content": "<|_placeholder_078|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128079": {"content": "<|_placeholder_079|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128080": {"content": "<|_placeholder_080|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128081": {"content": "<|_placeholder_081|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128082": {"content": "<|_placeholder_082|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128083": {"content": "<|_placeholder_083|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128084": {"content": "<|_placeholder_084|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128085": {"content": "<|_placeholder_085|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128086": {"content": "<|_placeholder_086|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128087": {"content": "<|_placeholder_087|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128088": {"content": "<|_placeholder_088|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128089": {"content": "<|_placeholder_089|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128090": {"content": "<|_placeholder_090|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128091": {"content": "<|_placeholder_091|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128092": {"content": "<|_placeholder_092|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128093": {"content": "<|_placeholder_093|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128094": {"content": "<|_placeholder_094|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128095": {"content": "<|_placeholder_095|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128096": {"content": "<|_placeholder_096|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128097": {"content": "<|_placeholder_097|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128098": {"content": "<|_placeholder_098|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128099": {"content": "<|_placeholder_099|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128100": {"content": "<|_placeholder_100|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128101": {"content": "<|_placeholder_101|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128102": {"content": "<|_placeholder_102|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128103": {"content": "<|_placeholder_103|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128104": {"content": "<|_placeholder_104|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128105": {"content": "<|_placeholder_105|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128106": {"content": "<|_placeholder_106|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128107": {"content": "<|_placeholder_107|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128108": {"content": "<|_placeholder_108|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128109": {"content": "<|_placeholder_109|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128110": {"content": "<|_placeholder_110|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128111": {"content": "<|_placeholder_111|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128112": {"content": "<|_placeholder_112|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128113": {"content": "<|_placeholder_113|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128114": {"content": "<|_placeholder_114|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128115": {"content": "<|_placeholder_115|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128116": {"content": "<|_placeholder_116|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128117": {"content": "<|_placeholder_117|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128118": {"content": "<|_placeholder_118|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128119": {"content": "<|_placeholder_119|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128120": {"content": "<|_placeholder_120|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128121": {"content": "<|_placeholder_121|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128122": {"content": "<|_placeholder_122|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128123": {"content": "<|_placeholder_123|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128124": {"content": "<|_placeholder_124|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128125": {"content": "<|_placeholder_125|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128126": {"content": "<|_placeholder_126|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128127": {"content": "<|_placeholder_127|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128128": {"content": "<|_placeholder_128|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128129": {"content": "<|_placeholder_129|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128130": {"content": "<|_placeholder_130|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128131": {"content": "<|_placeholder_131|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128132": {"content": "<|_placeholder_132|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128133": {"content": "<|_placeholder_133|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128134": {"content": "<|_placeholder_134|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128135": {"content": "<|_placeholder_135|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128136": {"content": "<|_placeholder_136|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128137": {"content": "<|_placeholder_137|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128138": {"content": "<|_placeholder_138|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128139": {"content": "<|_placeholder_139|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128140": {"content": "<|_placeholder_140|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128141": {"content": "<|_placeholder_141|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128142": {"content": "<|_placeholder_142|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128143": {"content": "<|_placeholder_143|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128144": {"content": "<|_placeholder_144|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128145": {"content": "<|_placeholder_145|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128146": {"content": "<|_placeholder_146|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128147": {"content": "<|_placeholder_147|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128148": {"content": "<|_placeholder_148|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128149": {"content": "<|_placeholder_149|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128150": {"content": "<|_placeholder_150|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128151": {"content": "<|_placeholder_151|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128152": {"content": "<|_placeholder_152|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128153": {"content": "<|_placeholder_153|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128154": {"content": "<|_placeholder_154|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128155": {"content": "<|_placeholder_155|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128156": {"content": "<|_placeholder_156|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128157": {"content": "<|_placeholder_157|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128158": {"content": "<|_placeholder_158|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128159": {"content": "<|_placeholder_159|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128160": {"content": "<|_placeholder_160|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128161": {"content": "<|_placeholder_161|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128162": {"content": "<|_placeholder_162|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128163": {"content": "<|_placeholder_163|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128164": {"content": "<|_placeholder_164|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128165": {"content": "<|_placeholder_165|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128166": {"content": "<|_placeholder_166|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128167": {"content": "<|_placeholder_167|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128168": {"content": "<|_placeholder_168|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128169": {"content": "<|_placeholder_169|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128170": {"content": "<|_placeholder_170|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128171": {"content": "<|_placeholder_171|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128172": {"content": "<|_placeholder_172|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128173": {"content": "<|_placeholder_173|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128174": {"content": "<|_placeholder_174|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128175": {"content": "<|_placeholder_175|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128176": {"content": "<|_placeholder_176|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128177": {"content": "<|_placeholder_177|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128178": {"content": "<|_placeholder_178|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128179": {"content": "<|_placeholder_179|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128180": {"content": "<|_placeholder_180|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128181": {"content": "<|_placeholder_181|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128182": {"content": "<|_placeholder_182|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128183": {"content": "<|_placeholder_183|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128184": {"content": "<|_placeholder_184|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128185": {"content": "<|_placeholder_185|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128186": {"content": "<|_placeholder_186|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128187": {"content": "<|_placeholder_187|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128188": {"content": "<|_placeholder_188|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128189": {"content": "<|_placeholder_189|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128190": {"content": "<|_placeholder_190|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128191": {"content": "<|_placeholder_191|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128192": {"content": "<|_placeholder_192|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128193": {"content": "<|_placeholder_193|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128194": {"content": "<|_placeholder_194|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128195": {"content": "<|_placeholder_195|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128196": {"content": "<|_placeholder_196|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128197": {"content": "<|_placeholder_197|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128198": {"content": "<|_placeholder_198|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128199": {"content": "<|_placeholder_199|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128200": {"content": "<|_placeholder_200|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128201": {"content": "<|_placeholder_201|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128202": {"content": "<|_placeholder_202|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128203": {"content": "<|_placeholder_203|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128204": {"content": "<|_placeholder_204|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128205": {"content": "<|_placeholder_205|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128206": {"content": "<|_placeholder_206|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128207": {"content": "<|_placeholder_207|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128208": {"content": "<|_placeholder_208|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128209": {"content": "<|_placeholder_209|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128210": {"content": "<|_placeholder_210|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128211": {"content": "<|_placeholder_211|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128212": {"content": "<|_placeholder_212|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128213": {"content": "<|_placeholder_213|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128214": {"content": "<|_placeholder_214|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128215": {"content": "<|_placeholder_215|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128216": {"content": "<|_placeholder_216|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128217": {"content": "<|_placeholder_217|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128218": {"content": "<|_placeholder_218|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128219": {"content": "<|_placeholder_219|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128220": {"content": "<|_placeholder_220|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128221": {"content": "<|_placeholder_221|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128222": {"content": "<|_placeholder_222|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128223": {"content": "<|_placeholder_223|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "128224": {"content": "<|_placeholder_224|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false,
1810
+ "special": true
1811
+ },
1812
+ "128225": {
1813
+ "content": "<|_placeholder_225|>",
1814
+ "lstrip": false,
1815
+ "normalized": false,
1816
+ "rstrip": false,
1817
+ "single_word": false,
1818
+ "special": true
1819
+ },
1820
+ "128226": {
1821
+ "content": "<|_placeholder_226|>",
1822
+ "lstrip": false,
1823
+ "normalized": false,
1824
+ "rstrip": false,
1825
+ "single_word": false,
1826
+ "special": true
1827
+ },
1828
+ "128227": {
1829
+ "content": "<|_placeholder_227|>",
1830
+ "lstrip": false,
1831
+ "normalized": false,
1832
+ "rstrip": false,
1833
+ "single_word": false,
1834
+ "special": true
1835
+ },
1836
+ "128228": {
1837
+ "content": "<|_placeholder_228|>",
1838
+ "lstrip": false,
1839
+ "normalized": false,
1840
+ "rstrip": false,
1841
+ "single_word": false,
1842
+ "special": true
1843
+ },
1844
+ "128229": {
1845
+ "content": "<|_placeholder_229|>",
1846
+ "lstrip": false,
1847
+ "normalized": false,
1848
+ "rstrip": false,
1849
+ "single_word": false,
1850
+ "special": true
1851
+ },
1852
+ "128230": {
1853
+ "content": "<|_placeholder_230|>",
1854
+ "lstrip": false,
1855
+ "normalized": false,
1856
+ "rstrip": false,
1857
+ "single_word": false,
1858
+ "special": true
1859
+ },
1860
+ "128231": {
1861
+ "content": "<|_placeholder_231|>",
1862
+ "lstrip": false,
1863
+ "normalized": false,
1864
+ "rstrip": false,
1865
+ "single_word": false,
1866
+ "special": true
1867
+ },
1868
+ "128232": {
1869
+ "content": "<|_placeholder_232|>",
1870
+ "lstrip": false,
1871
+ "normalized": false,
1872
+ "rstrip": false,
1873
+ "single_word": false,
1874
+ "special": true
1875
+ },
1876
+ "128233": {
1877
+ "content": "<|_placeholder_233|>",
1878
+ "lstrip": false,
1879
+ "normalized": false,
1880
+ "rstrip": false,
1881
+ "single_word": false,
1882
+ "special": true
1883
+ },
1884
+ "128234": {
1885
+ "content": "<|_placeholder_234|>",
1886
+ "lstrip": false,
1887
+ "normalized": false,
1888
+ "rstrip": false,
1889
+ "single_word": false,
1890
+ "special": true
1891
+ },
1892
+ "128235": {
1893
+ "content": "<|_placeholder_235|>",
1894
+ "lstrip": false,
1895
+ "normalized": false,
1896
+ "rstrip": false,
1897
+ "single_word": false,
1898
+ "special": true
1899
+ },
1900
+ "128236": {
1901
+ "content": "<|_placeholder_236|>",
1902
+ "lstrip": false,
1903
+ "normalized": false,
1904
+ "rstrip": false,
1905
+ "single_word": false,
1906
+ "special": true
1907
+ },
1908
+ "128237": {
1909
+ "content": "<|_placeholder_237|>",
1910
+ "lstrip": false,
1911
+ "normalized": false,
1912
+ "rstrip": false,
1913
+ "single_word": false,
1914
+ "special": true
1915
+ },
1916
+ "128238": {
1917
+ "content": "<|_placeholder_238|>",
1918
+ "lstrip": false,
1919
+ "normalized": false,
1920
+ "rstrip": false,
1921
+ "single_word": false,
1922
+ "special": true
1923
+ },
1924
+ "128239": {
1925
+ "content": "<|_placeholder_239|>",
1926
+ "lstrip": false,
1927
+ "normalized": false,
1928
+ "rstrip": false,
1929
+ "single_word": false,
1930
+ "special": true
1931
+ },
1932
+ "128240": {
1933
+ "content": "<|_placeholder_240|>",
1934
+ "lstrip": false,
1935
+ "normalized": false,
1936
+ "rstrip": false,
1937
+ "single_word": false,
1938
+ "special": true
1939
+ },
1940
+ "128241": {
1941
+ "content": "<|_placeholder_241|>",
1942
+ "lstrip": false,
1943
+ "normalized": false,
1944
+ "rstrip": false,
1945
+ "single_word": false,
1946
+ "special": true
1947
+ },
1948
+ "128242": {
1949
+ "content": "<|_placeholder_242|>",
1950
+ "lstrip": false,
1951
+ "normalized": false,
1952
+ "rstrip": false,
1953
+ "single_word": false,
1954
+ "special": true
1955
+ },
1956
+ "128243": {
1957
+ "content": "<|_placeholder_243|>",
1958
+ "lstrip": false,
1959
+ "normalized": false,
1960
+ "rstrip": false,
1961
+ "single_word": false,
1962
+ "special": true
1963
+ },
1964
+ "128244": {
1965
+ "content": "<|_placeholder_244|>",
1966
+ "lstrip": false,
1967
+ "normalized": false,
1968
+ "rstrip": false,
1969
+ "single_word": false,
1970
+ "special": true
1971
+ },
1972
+ "128245": {
1973
+ "content": "<|_placeholder_245|>",
1974
+ "lstrip": false,
1975
+ "normalized": false,
1976
+ "rstrip": false,
1977
+ "single_word": false,
1978
+ "special": true
1979
+ },
1980
+ "128246": {
1981
+ "content": "<|_placeholder_246|>",
1982
+ "lstrip": false,
1983
+ "normalized": false,
1984
+ "rstrip": false,
1985
+ "single_word": false,
1986
+ "special": true
1987
+ },
1988
+ "128247": {
1989
+ "content": "<|_placeholder_247|>",
1990
+ "lstrip": false,
1991
+ "normalized": false,
1992
+ "rstrip": false,
1993
+ "single_word": false,
1994
+ "special": true
1995
+ },
1996
+ "128248": {
1997
+ "content": "<|_placeholder_248|>",
1998
+ "lstrip": false,
1999
+ "normalized": false,
2000
+ "rstrip": false,
2001
+ "single_word": false,
2002
+ "special": true
2003
+ },
2004
+ "128249": {
2005
+ "content": "<|_placeholder_249|>",
2006
+ "lstrip": false,
2007
+ "normalized": false,
2008
+ "rstrip": false,
2009
+ "single_word": false,
2010
+ "special": true
2011
+ },
2012
+ "128250": {
2013
+ "content": "<|_placeholder_250|>",
2014
+ "lstrip": false,
2015
+ "normalized": false,
2016
+ "rstrip": false,
2017
+ "single_word": false,
2018
+ "special": true
2019
+ },
2020
+ "128251": {
2021
+ "content": "<|_placeholder_251|>",
2022
+ "lstrip": false,
2023
+ "normalized": false,
2024
+ "rstrip": false,
2025
+ "single_word": false,
2026
+ "special": true
2027
+ },
2028
+ "128252": {
2029
+ "content": "<|_placeholder_252|>",
2030
+ "lstrip": false,
2031
+ "normalized": false,
2032
+ "rstrip": false,
2033
+ "single_word": false,
2034
+ "special": true
2035
+ },
2036
+ "128253": {
2037
+ "content": "<|_placeholder_253|>",
2038
+ "lstrip": false,
2039
+ "normalized": false,
2040
+ "rstrip": false,
2041
+ "single_word": false,
2042
+ "special": true
2043
+ },
2044
+ "128254": {
2045
+ "content": "<|_placeholder_254|>",
2046
+ "lstrip": false,
2047
+ "normalized": false,
2048
+ "rstrip": false,
2049
+ "single_word": false,
2050
+ "special": true
2051
+ },
2052
+ "128255": {
2053
+ "content": "<|_placeholder_255|>",
2054
+ "lstrip": false,
2055
+ "normalized": false,
2056
+ "rstrip": false,
2057
+ "single_word": false,
2058
+ "special": true
2059
+ }
2060
+ },
2061
+ "auto_map": {
2062
+ "AutoProcessor": "processing_vlm.HCXVisionV2Processor"
2063
+ },
2064
+ "bos_token": "<|endoftext|>",
2065
+ "clean_up_tokenization_spaces": true,
2066
+ "eos_token": "<|im_end|>",
2067
+ "extra_special_tokens": {
2068
+ "image_token": "<|IMAGE_PAD|>",
2069
+ "video_token": "<|VIDEO_PAD|>"
2070
+ },
2071
+ "image_token": "<|IMAGE_PAD|>",
2072
+ "model_max_length": 1000000000000000019884624838656,
2073
+ "pad_token": "<|endoftext|>",
2074
+ "processor_class": "HCXVisionV2Processor",
2075
+ "sep_token": "<|endoftext|>",
2076
+ "tokenizer_class": "GPT2Tokenizer",
2077
+ "unk_token": "<|endoftext|>",
2078
+ "video_token": "<|VIDEO_PAD|>"
2079
+ }
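
Note: the "auto_map" entry above binds AutoProcessor to processing_vlm.HCXVisionV2Processor, custom code shipped inside this repository, so loading the processor requires trust_remote_code=True. A minimal sketch of how this config is consumed, with a hypothetical repo id standing in for the real one:

    from transformers import AutoProcessor, AutoTokenizer

    repo_id = "org/model-repo"  # hypothetical placeholder, not the actual repo id
    # auto_map resolves AutoProcessor to processing_vlm.HCXVisionV2Processor,
    # which lives in the repo itself, hence trust_remote_code=True.
    processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
    # tokenizer_class is a stock GPT2Tokenizer, so no remote code is needed here.
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    print(tokenizer.bos_token, tokenizer.eos_token)  # <|endoftext|> <|im_end|>
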
vocab.json ADDED
The diff for this file is too large to render. See raw diff
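
The <|_placeholder_*|> entries appear to pad the special-token range out to id 128255, reserving ids for future tokens. Since each is declared "special": true with normalization and stripping disabled, it maps to a fixed id and can be dropped on decode. A short sketch, again with a hypothetical repo id:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("org/model-repo")  # hypothetical id
    # Special tokens resolve to the fixed ids declared in this config:
    print(tok.convert_tokens_to_ids("<|IMAGE_PAD|>"))         # 128060
    print(tok.convert_tokens_to_ids("<|_placeholder_255|>"))  # 128255
    # Because they are declared special, decoding can strip them:
    print(tok.decode([128060, 128255], skip_special_tokens=True))  # ""
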