samsja committed on
Commit
ad6b392
·
verified ·
1 Parent(s): 2557902

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - prime-rl
5
+ - moe
6
+ - test-model
7
+ library_name: transformers
8
+ ---
9
+
10
+ <div align="center">
11
+ <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/61e020e4a343274bb132e138/H2mcdPRWtl4iKLd-OYYBc.jpeg" width="200"/>
12
+ </div>
13
+
14
+ # minimax-m2-tiny
15
+
16
+ A small (~252M-parameter) MiniMax M2 MoE model for testing only. It is generally compatible with vLLM and Hugging Face Transformers but is meant to be used with [prime-rl](https://github.com/PrimeIntellect-ai/prime-rl).
17
+
18
+ This model has random weights: no SFT warmup has been run yet because of a chat template tokenization issue with MiniMax's tokenizer.
19
+
20
+ ## Quick Start
21
+
22
+ ```bash
23
+ uv run rl @ configs/ci/integration/rl_moe/minimax_m2.toml
24
+ ```
25
+
26
+ See the [Testing MoE at Small Scale](https://github.com/PrimeIntellect-ai/prime-rl/blob/main/docs/testing-moe-at-small-scale.md) guide for full instructions.
27
+
28
+ ## Model Details
29
+
30
+ | Parameter | Value |
31
+ |-----------|-------|
32
+ | Hidden size | 512 |
33
+ | Layers | 12 |
34
+ | Experts | 8 |
35
+ | Active experts | 4 |
36
+ | Parameters | ~252M |
37
+
38
+ ## Links
39
+
40
+ - [prime-rl](https://github.com/PrimeIntellect-ai/prime-rl) - RL training framework
41
+ - [PrimeIntellect](https://www.primeintellect.ai/) - Building infrastructure for decentralized AI
chat_template.jinja ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {# ---------- special token variables ---------- #}
2
+ {%- set toolcall_begin_token = '<minimax:tool_call>' -%}
3
+ {%- set toolcall_end_token = '</minimax:tool_call>' -%}
4
+ {#- Tool Rendering Functions ============================================== -#}
5
+ {%- macro render_tool_namespace(namespace_name, tool_list) -%}
6
+ {%- for tool in tool_list -%}
7
+ <tool>{{ tool.function | tojson(ensure_ascii=False) }}</tool>
8
+ {% endfor -%}
9
+ {%- endmacro -%}
10
+ {%- macro visible_text(content) -%}
11
+ {%- if content is string -%}
12
+ {{ content }}
13
+ {%- elif content is iterable and content is not mapping -%}
14
+ {%- for item in content -%}
15
+ {%- if item is mapping and item.type == 'text' -%}
16
+ {{- item.text }}
17
+ {%- elif item is string -%}
18
+ {{- item }}
19
+ {%- endif -%}
20
+ {%- endfor -%}
21
+ {%- elif content is none -%}
22
+ {{- '' }}
23
+ {%- else -%}
24
+ {{- content }}
25
+ {%- endif -%}
26
+ {%- endmacro -%}
27
+ {#- System Message Construction ============================================ -#}
28
+ {%- macro build_system_message(system_message) -%}
29
+ {%- if system_message and system_message.content -%}
30
+ {{- visible_text(system_message.content) }}
31
+ {%- else -%}
32
+ {%- if model_identity is not defined -%}
33
+ {%- set model_identity = "You are a helpful assistant. Your name is MiniMax-M2.1 and is built by MiniMax." -%}
34
+ {%- endif -%}
35
+ {{- model_identity }}
36
+ {%- endif -%}
37
+
38
+ {#- Handle current_date -#}
39
+ {%- if system_message and system_message.current_date -%}
40
+ {{- '\n' ~ 'Current date: ' + system_message.current_date }}
41
+ {%- endif -%}
42
+ {#- Handle current_location -#}
43
+ {%- if system_message and system_message.current_location -%}
44
+ {{- '\n' ~ 'Current location: ' + system_message.current_location }}
45
+ {%- endif -%}
46
+ {%- endmacro -%}
47
+ {#- Main Template Logic ================================================= -#}
48
+ {#- Extract system message (only first message if it's system) -#}
49
+ {%- set system_message = none -%}
50
+ {%- set conversation_messages = messages -%}
51
+ {%- if messages and messages[0].role == "system" -%}
52
+ {%- set system_message = messages[0] -%}
53
+ {%- set conversation_messages = messages[1:] -%}
54
+ {%- endif -%}
55
+ {#- Get the last user message turn, for interleaved thinking -#}
56
+ {%- set ns = namespace(last_user_index=-1) %}
57
+ {% for m in conversation_messages %}
58
+ {%- if m.role == 'user' %}
59
+ {% set ns.last_user_index = loop.index0 -%}
60
+ {%- endif %}
61
+ {%- endfor %}
62
+ {#- Render system message -#}
63
+ {{- ']~!b[' ~ ']~b]system' ~ '\n' }}
64
+ {{- build_system_message(system_message) }}
65
+ {#- Render tools if available -#}
66
+ {%- if tools -%}
67
+ {{- '\n\n' ~ '# Tools' ~ '\n' ~ 'You may call one or more tools to assist with the user query.\nHere are the tools available in JSONSchema format:' ~ '\n' }}
68
+ {{- '\n' ~ '<tools>' ~ '\n' }}
69
+ {{- render_tool_namespace("functions", tools) }}
70
+ {{- '</tools>' ~ '\n\n' }}
71
+ {{- 'When making tool calls, use XML format to invoke tools and pass parameters:' ~ '\n' }}
72
+ {{- '\n' ~ toolcall_begin_token }}
73
+ <invoke name="tool-name-1">
74
+ <parameter name="param-key-1">param-value-1</parameter>
75
+ <parameter name="param-key-2">param-value-2</parameter>
76
+ ...
77
+ </invoke>
78
+ {{- '\n' ~ toolcall_end_token }}
79
+ {%- endif -%}
80
+ {{- '[e~[\n' }}
81
+
82
+ {#- Render messages -#}
83
+ {%- set last_tool_call = namespace(name=none) -%}
84
+ {%- for message in conversation_messages -%}
85
+ {%- if message.role == 'assistant' -%}
86
+ {#- Only render reasoning_content if no user message follows -#}
87
+ {{- ']~b]ai' ~ '\n' }}
88
+
89
+ {%- set reasoning_content = '' %}
90
+ {%- set content = visible_text(message.content) %}
91
+ {%- if message.reasoning_content is string %}
92
+ {%- set reasoning_content = message.reasoning_content %}
93
+ {%- else %}
94
+ {%- if '</think>' in content %}
95
+ {%- set reasoning_content = content.split('</think>')[0].strip('\n').split('<think>')[-1].strip('\n') %}
96
+ {%- set content = content.split('</think>')[-1].strip('\n') %}
97
+ {%- endif %}
98
+ {%- endif %}
99
+ {%- if reasoning_content and loop.index0 > ns.last_user_index -%}
100
+ {{- '<think>' ~ '\n' ~ reasoning_content ~ '\n' ~ '</think>' ~ '\n\n' }}
101
+ {%- endif -%}
102
+ {%- if content -%}
103
+ {{- content }}
104
+ {%- endif -%}
105
+ {%- if message.tool_calls -%}
106
+ {{- '\n' ~ toolcall_begin_token ~ '\n' }}
107
+
108
+ {%- for tool_call in message.tool_calls -%}
109
+ {%- if tool_call.function %}
110
+ {%- set tool_call = tool_call.function %}
111
+ {%- endif %}
112
+ {{- '<invoke name="' + tool_call.name + '">' }}
113
+ {% set _args = tool_call.arguments %}
114
+ {%- for k, v in _args.items() %}
115
+ {{- '<parameter name="' + k + '">' }}
116
+ {{- v | tojson(ensure_ascii=False) if v is not string else v }}
117
+ {{- '</parameter>' }}
118
+ {% endfor %}
119
+ {{- '</invoke>' ~ '\n' }}
120
+ {%- endfor -%}
121
+
122
+ {{- toolcall_end_token}}
123
+ {%- if message.tool_calls[-1].function -%}
124
+ {%- set last_tool_call.name = message.tool_calls[-1].function.name -%}
125
+ {%- else -%}
126
+ {%- set last_tool_call.name = message.tool_calls[-1].name -%}
127
+ {%- endif -%}
128
+ {%- else -%}
129
+ {%- set last_tool_call.name = none -%}
130
+ {%- endif -%}
131
+ {{- '[e~[' ~ '\n' }}
132
+
133
+ {%- elif message.role == 'tool' -%}
134
+ {%- if last_tool_call.name is none -%}
135
+ {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
136
+ {%- endif -%}
137
+ {%- if loop.first or (conversation_messages[loop.index0 - 1].role != 'tool') -%}
138
+ {{- ']~b]tool' }}
139
+ {%- endif -%}
140
+ {%- if message.content is string -%}
141
+ {{- '\n<response>' }}
142
+ {{- message.content }}
143
+ {{- '</response>' }}
144
+ {%- else -%}
145
+ {%- for tr in message.content -%}
146
+ {{- '\n<response>' }}
147
+ {{- tr.output if tr.output is defined else (tr.text if tr.type == 'text' and tr.text is defined else tr) }}
148
+ {{- '\n</response>' }}
149
+ {%- endfor -%}
150
+ {%- endif -%}
151
+ {%- if loop.last or (conversation_messages[loop.index0 + 1].role != 'tool') -%}
152
+ {{- '[e~[\n' -}}
153
+ {%- endif -%}
154
+
155
+ {%- elif message.role == 'user' -%}
156
+ {{- ']~b]user' ~ '\n' }}
157
+ {{- visible_text(message.content) }}
158
+ {{- '[e~[' ~ '\n' }}
159
+ {%- endif -%}
160
+ {%- endfor -%}
161
+
162
+ {#- Generation prompt -#}
163
+ {%- if add_generation_prompt -%}
164
+ {{- ']~b]ai' ~ '\n' ~ '<think>' ~ '\n' }}
165
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MiniMaxM2ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 256,
15
+ "max_position_embeddings": 4096,
16
+ "model_type": "minimax_m2",
17
+ "num_attention_heads": 8,
18
+ "num_experts_per_tok": 4,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "num_local_experts": 8,
22
+ "output_router_logits": false,
23
+ "pad_token_id": null,
24
+ "partial_rotary_factor": 0.5,
25
+ "qk_norm_type": "per_layer",
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_parameters": {
28
+ "partial_rotary_factor": 0.5,
29
+ "rope_theta": 5000000,
30
+ "rope_type": "default"
31
+ },
32
+ "rope_theta": 5000000,
33
+ "rotary_dim": 32,
34
+ "router_aux_loss_coef": 0.001,
35
+ "router_jitter_noise": 0.0,
36
+ "scoring_func": "sigmoid",
37
+ "sliding_window": null,
38
+ "tie_word_embeddings": false,
39
+ "transformers_version": "5.2.0.dev0",
40
+ "use_cache": true,
41
+ "use_grouped_mm": false,
42
+ "use_qk_norm": true,
43
+ "use_routing_bias": true,
44
+ "vocab_size": 200064
45
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "transformers_version": "5.2.0.dev0",
8
+ "use_cache": true
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faa1ec86037a4158601bd23eda27f8fa7d44bdd720b73740e4cd38c6513b3158
3
+ size 1008540808
prime/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35f81f4a7dd58964273a9c0a9d44a6729767c0f9a087d56209c3d9892f0a18ce
3
+ size 1008508152
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b81e5e5cba2b169e86a0771825a927e9d41b4c4484ded4a286410f41f702f17
3
+ size 15523144
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "]~!b[",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "[e~[",
7
+ "extra_special_tokens": [
8
+ "<code_interpreter>",
9
+ "<commit_after>",
10
+ "<commit_before>",
11
+ "<commit_msg>",
12
+ "<empty_output>",
13
+ "<filename>",
14
+ "<fim_middle>",
15
+ "<fim_pad>",
16
+ "<fim_prefix>",
17
+ "<fim_suffix>",
18
+ "<function_call>",
19
+ "<gh_stars>",
20
+ "]<]speech[>[",
21
+ "]<]image[>[",
22
+ "]<]video[>[",
23
+ "]<]start of speech[>[",
24
+ "]<]end of speech[>[",
25
+ "]<]start of image[>[",
26
+ "]<]end of image[>[",
27
+ "]<]start of video[>[",
28
+ "]<]end of video[>[",
29
+ "]<]vision pad[>[",
30
+ "]~!b[",
31
+ "<issue_closed>",
32
+ "<issue_comment>",
33
+ "<issue_start>",
34
+ "<jupyter_code>",
35
+ "<jupyter_output>",
36
+ "<jupyter_start>",
37
+ "<jupyter_text>",
38
+ "<reponame>",
39
+ "[e~[",
40
+ "]!d~[",
41
+ "]!p~[",
42
+ "]~b]",
43
+ "<jupyter_error>",
44
+ "<add_file>",
45
+ "<delete_file>",
46
+ "<rename_file>",
47
+ "<edit_file>",
48
+ "<commit_message>",
49
+ "<empty_source_file>",
50
+ "<repo_struct>",
51
+ "<code_context>",
52
+ "<file_content>",
53
+ "<source_files>",
54
+ "<pr_start>",
55
+ "<review_comment>",
56
+ "<filepath>",
57
+ "<file_sep>"
58
+ ],
59
+ "is_local": false,
60
+ "model_max_length": 40960000,
61
+ "tokenizer_class": "TokenizersBackend",
62
+ "unk_token": "]!d~["
63
+ }