shentao.scott committed on
Commit
ede5526
·
0 Parent(s):

track large files with LFS

Browse files
chat_template.jinja ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% if t2i_generate %}<|im_start|>system
2
+ You are a helpful image generator.<|im_end|>
3
+ {% else %}<|im_start|>system
4
+ You are a helpful assistant.<|im_end|>
5
+ {% endif %}{% endif %}<|im_start|>{{ message['role'] }}
6
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
7
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
8
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
9
+ {% if t2i_generate %}<|image start|>{{ ar_width }}*{{ ar_height }}<|image token|>{% endif %}
10
+ {% endif %}
config.json ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Mammothmoda2Model"
4
+ ],
5
+ "gen_axes_dim_rope": [
6
+ 40,
7
+ 40,
8
+ 40
9
+ ],
10
+ "gen_axes_lens": [
11
+ 10000,
12
+ 10000,
13
+ 10000
14
+ ],
15
+ "gen_condition_mode": "text_image",
16
+ "gen_dit_config": {
17
+ "_class_name": "OmniGen2Transformer2DModel",
18
+ "_diffusers_version": "0.33.1",
19
+ "axes_dim_rope": [
20
+ 40,
21
+ 40,
22
+ 40
23
+ ],
24
+ "axes_lens": [
25
+ 1024,
26
+ 1664,
27
+ 1664
28
+ ],
29
+ "ffn_dim_multiplier": null,
30
+ "hidden_size": 2520,
31
+ "in_channels": 16,
32
+ "multiple_of": 256,
33
+ "norm_eps": 1e-05,
34
+ "num_attention_heads": 21,
35
+ "num_kv_heads": 7,
36
+ "num_layers": 16,
37
+ "num_refiner_layers": 2,
38
+ "out_channels": null,
39
+ "patch_size": 2,
40
+ "text_feat_dim": 2048,
41
+ "timestep_scale": 1000.0
42
+ },
43
+ "gen_image_condition_refiner_config": {
44
+ "num_layers": 2,
45
+ "num_queries": 128
46
+ },
47
+ "gen_transport_config": {
48
+ "do_shift": true,
49
+ "seq_len": 4096,
50
+ "snr_type": "lognorm"
51
+ },
52
+ "gen_vae_config": {
53
+ "_class_name": "AutoencoderKL",
54
+ "_diffusers_version": "0.30.0.dev0",
55
+ "_name_or_path": "../checkpoints/flux-dev",
56
+ "act_fn": "silu",
57
+ "block_out_channels": [
58
+ 128,
59
+ 256,
60
+ 512,
61
+ 512
62
+ ],
63
+ "down_block_types": [
64
+ "DownEncoderBlock2D",
65
+ "DownEncoderBlock2D",
66
+ "DownEncoderBlock2D",
67
+ "DownEncoderBlock2D"
68
+ ],
69
+ "force_upcast": true,
70
+ "in_channels": 3,
71
+ "latent_channels": 16,
72
+ "latents_mean": null,
73
+ "latents_std": null,
74
+ "layers_per_block": 2,
75
+ "mid_block_add_attention": true,
76
+ "norm_num_groups": 32,
77
+ "out_channels": 3,
78
+ "sample_size": 1024,
79
+ "scaling_factor": 0.3611,
80
+ "shift_factor": 0.1159,
81
+ "up_block_types": [
82
+ "UpDecoderBlock2D",
83
+ "UpDecoderBlock2D",
84
+ "UpDecoderBlock2D",
85
+ "UpDecoderBlock2D"
86
+ ],
87
+ "use_post_quant_conv": false,
88
+ "use_quant_conv": false
89
+ },
90
+ "initializer_range": 0.02,
91
+ "llm_config": {
92
+ "attention_dropout": 0.0,
93
+ "bos_token_id": 151643,
94
+ "eos_token_id": 151645,
95
+ "extra_gen_vocab": true,
96
+ "gen_vocab_size": 32800,
97
+ "gen_vocab_start_index": 152064,
98
+ "hidden_act": "silu",
99
+ "hidden_size": 3584,
100
+ "image_token_id": 151655,
101
+ "initializer_range": 0.02,
102
+ "intermediate_size": 18944,
103
+ "max_position_embeddings": 128000,
104
+ "max_window_layers": 28,
105
+ "model_type": "mammothmoda2_qwen2_5_vl",
106
+ "moe_type": "ffn-14:28",
107
+ "num_attention_heads": 28,
108
+ "num_hidden_layers": 28,
109
+ "num_key_value_heads": 4,
110
+ "rms_norm_eps": 1e-06,
111
+ "rope_scaling": {
112
+ "mrope_section": [
113
+ 16,
114
+ 24,
115
+ 24
116
+ ],
117
+ "rope_type": "default",
118
+ "type": "default"
119
+ },
120
+ "rope_theta": 1000000.0,
121
+ "sliding_window": 32768,
122
+ "text_config": {
123
+ "_name_or_path": "",
124
+ "add_cross_attention": false,
125
+ "architectures": [
126
+ "Qwen2_5_VLForConditionalGeneration"
127
+ ],
128
+ "attention_dropout": 0.0,
129
+ "bad_words_ids": null,
130
+ "begin_suppress_tokens": null,
131
+ "bos_token_id": 151643,
132
+ "chunk_size_feed_forward": 0,
133
+ "cross_attention_hidden_size": null,
134
+ "decoder_start_token_id": null,
135
+ "diversity_penalty": 0.0,
136
+ "do_sample": false,
137
+ "early_stopping": false,
138
+ "encoder_no_repeat_ngram_size": 0,
139
+ "eos_token_id": 151645,
140
+ "exponential_decay_length_penalty": null,
141
+ "extra_gen_vocab": true,
142
+ "finetuning_task": null,
143
+ "forced_bos_token_id": null,
144
+ "forced_eos_token_id": null,
145
+ "gen_vocab_size": 32800,
146
+ "gen_vocab_start_index": 152064,
147
+ "hidden_act": "silu",
148
+ "hidden_size": 3584,
149
+ "id2label": {
150
+ "0": "LABEL_0",
151
+ "1": "LABEL_1"
152
+ },
153
+ "image_token_id": null,
154
+ "initializer_range": 0.02,
155
+ "intermediate_size": 18944,
156
+ "is_decoder": false,
157
+ "is_encoder_decoder": false,
158
+ "label2id": {
159
+ "LABEL_0": 0,
160
+ "LABEL_1": 1
161
+ },
162
+ "layer_types": [
163
+ "full_attention",
164
+ "full_attention",
165
+ "full_attention",
166
+ "full_attention",
167
+ "full_attention",
168
+ "full_attention",
169
+ "full_attention",
170
+ "full_attention",
171
+ "full_attention",
172
+ "full_attention",
173
+ "full_attention",
174
+ "full_attention",
175
+ "full_attention",
176
+ "full_attention",
177
+ "full_attention",
178
+ "full_attention",
179
+ "full_attention",
180
+ "full_attention",
181
+ "full_attention",
182
+ "full_attention",
183
+ "full_attention",
184
+ "full_attention",
185
+ "full_attention",
186
+ "full_attention",
187
+ "full_attention",
188
+ "full_attention",
189
+ "full_attention",
190
+ "full_attention"
191
+ ],
192
+ "length_penalty": 1.0,
193
+ "max_length": 20,
194
+ "max_position_embeddings": 128000,
195
+ "max_window_layers": 28,
196
+ "min_length": 0,
197
+ "model_type": "mammothmoda2_qwen2_5_vl_text",
198
+ "moe_type": "ffn-14:28",
199
+ "no_repeat_ngram_size": 0,
200
+ "num_attention_heads": 28,
201
+ "num_beam_groups": 1,
202
+ "num_beams": 1,
203
+ "num_hidden_layers": 28,
204
+ "num_key_value_heads": 4,
205
+ "num_return_sequences": 1,
206
+ "output_attentions": false,
207
+ "output_hidden_states": false,
208
+ "output_scores": false,
209
+ "pad_token_id": null,
210
+ "prefix": null,
211
+ "problem_type": null,
212
+ "pruned_heads": {},
213
+ "remove_invalid_values": false,
214
+ "repetition_penalty": 1.0,
215
+ "return_dict": true,
216
+ "return_dict_in_generate": false,
217
+ "rms_norm_eps": 1e-06,
218
+ "rope_scaling": {
219
+ "mrope_section": [
220
+ 16,
221
+ 24,
222
+ 24
223
+ ],
224
+ "rope_type": "default",
225
+ "type": "default"
226
+ },
227
+ "rope_theta": 1000000.0,
228
+ "sep_token_id": null,
229
+ "sliding_window": null,
230
+ "suppress_tokens": null,
231
+ "task_specific_params": null,
232
+ "temperature": 1.0,
233
+ "tf_legacy_loss": false,
234
+ "tie_encoder_decoder": false,
235
+ "tie_word_embeddings": false,
236
+ "tokenizer_class": null,
237
+ "top_k": 50,
238
+ "top_p": 1.0,
239
+ "torch_dtype": "bfloat16",
240
+ "torchscript": false,
241
+ "typical_p": 1.0,
242
+ "unpack_forward": true,
243
+ "use_bfloat16": false,
244
+ "use_cache": true,
245
+ "use_sliding_window": false,
246
+ "video_token_id": null,
247
+ "vision_end_token_id": 151653,
248
+ "vision_start_token_id": 151652,
249
+ "vision_token_id": 151654,
250
+ "vocab_size": 152064
251
+ },
252
+ "tie_word_embeddings": false,
253
+ "torch_dtype": "float32",
254
+ "unpack_forward": true,
255
+ "use_cache": true,
256
+ "use_sliding_window": false,
257
+ "video_token_id": 151656,
258
+ "vision_config": {
259
+ "_name_or_path": "",
260
+ "add_cross_attention": false,
261
+ "architectures": null,
262
+ "bad_words_ids": null,
263
+ "begin_suppress_tokens": null,
264
+ "bos_token_id": null,
265
+ "chunk_size_feed_forward": 0,
266
+ "cross_attention_hidden_size": null,
267
+ "decoder_start_token_id": null,
268
+ "depth": 32,
269
+ "diversity_penalty": 0.0,
270
+ "do_sample": false,
271
+ "early_stopping": false,
272
+ "encoder_no_repeat_ngram_size": 0,
273
+ "eos_token_id": null,
274
+ "exponential_decay_length_penalty": null,
275
+ "finetuning_task": null,
276
+ "forced_bos_token_id": null,
277
+ "forced_eos_token_id": null,
278
+ "fullatt_block_indexes": [
279
+ 7,
280
+ 15,
281
+ 23,
282
+ 31
283
+ ],
284
+ "hidden_act": "silu",
285
+ "hidden_size": 1280,
286
+ "id2label": {
287
+ "0": "LABEL_0",
288
+ "1": "LABEL_1"
289
+ },
290
+ "in_channels": 3,
291
+ "in_chans": 3,
292
+ "initializer_range": 0.02,
293
+ "intermediate_size": 3420,
294
+ "is_decoder": false,
295
+ "is_encoder_decoder": false,
296
+ "label2id": {
297
+ "LABEL_0": 0,
298
+ "LABEL_1": 1
299
+ },
300
+ "length_penalty": 1.0,
301
+ "max_length": 20,
302
+ "min_length": 0,
303
+ "model_type": "mammothmoda2_qwen2_5_vl",
304
+ "no_repeat_ngram_size": 0,
305
+ "num_beam_groups": 1,
306
+ "num_beams": 1,
307
+ "num_heads": 16,
308
+ "num_return_sequences": 1,
309
+ "out_hidden_size": 3584,
310
+ "output_attentions": false,
311
+ "output_hidden_states": false,
312
+ "output_scores": false,
313
+ "pad_token_id": null,
314
+ "patch_size": 14,
315
+ "prefix": null,
316
+ "problem_type": null,
317
+ "pruned_heads": {},
318
+ "remove_invalid_values": false,
319
+ "repetition_penalty": 1.0,
320
+ "return_dict": true,
321
+ "return_dict_in_generate": false,
322
+ "sep_token_id": null,
323
+ "spatial_merge_size": 2,
324
+ "spatial_patch_size": 14,
325
+ "suppress_tokens": null,
326
+ "task_specific_params": null,
327
+ "temperature": 1.0,
328
+ "temporal_patch_size": 2,
329
+ "tf_legacy_loss": false,
330
+ "tie_encoder_decoder": false,
331
+ "tie_word_embeddings": true,
332
+ "tokenizer_class": null,
333
+ "tokens_per_second": 2,
334
+ "top_k": 50,
335
+ "top_p": 1.0,
336
+ "torch_dtype": null,
337
+ "torchscript": false,
338
+ "typical_p": 1.0,
339
+ "use_bfloat16": false,
340
+ "window_size": 112
341
+ },
342
+ "vision_end_token_id": 151653,
343
+ "vision_start_token_id": 151652,
344
+ "vision_token_id": 151654,
345
+ "vocab_size": 152064
346
+ },
347
+ "llm_moe_init_gen_from_und": true,
348
+ "llm_pretrained_path": null,
349
+ "model_type": "mammothmoda2",
350
+ "torch_dtype": "float32",
351
+ "transformers_version": "4.53.2",
352
+ "unpack_forward": true,
353
+ "verbose_interval": 200,
354
+ "vis_freeze": true
355
+ }
mammothu.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
mammothu_vision_tokens.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "min_pixels": 50176,
3
+ "max_pixels": 1605632,
4
+ "patch_size": 14,
5
+ "temporal_patch_size": 2,
6
+ "merge_size": 2,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "image_processor_type": "Qwen2VLImageProcessor",
18
+ "processor_class": "Mammothmoda2Processor"
19
+ }
t2i_generation_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_beams": 1,
3
+ "do_sample": true,
4
+ "top_p": 1.0,
5
+ "top_k": 2048,
6
+ "temperature": 1.0,
7
+ "repetition_penalty": 1.0,
8
+ "max_new_tokens": 1000,
9
+ "bos_token_id": 151643,
10
+ "pad_token_id": 151643,
11
+ "eos_token_id": [
12
+ 151645,
13
+ 151643
14
+ ],
15
+ "eol_token_id": 152064,
16
+ "visual_token_start_id": 152072,
17
+ "visual_token_end_id": 168456
18
+ }
und_generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "max_new_tokens": 1000,
8
+ "pad_token_id": 151643,
9
+ "temperature": null,
10
+ "top_p": null,
11
+ "transformers_version": "4.53.2"
12
+ }