Snider Cladius Maximus committed on
Commit
7559613
·
1 Parent(s): d5adada

fix: apply proven configs from mlx-community/unsloth references

Browse files

Fresh weights from full multimodal Google base model.
Configs matched against repos with millions of downloads.

Co-Authored-By: Cladius Maximus <cladius@lethean.io>

README.md CHANGED
@@ -1,7 +1,9 @@
1
  ---
2
- language: en
3
  library_name: mlx
 
 
4
  pipeline_tag: text-generation
5
  tags:
6
  - mlx
 
7
  ---
 
1
  ---
 
2
  library_name: mlx
3
+ license: apache-2.0
4
+ license_link: https://ai.google.dev/gemma/docs/gemma_4_license
5
  pipeline_tag: text-generation
6
  tags:
7
  - mlx
8
+ base_model: google/gemma-4-e4b-it
9
  ---
config.json CHANGED
@@ -161,5 +161,48 @@
161
  "tie_word_embeddings": true,
162
  "transformers_version": "5.5.0.dev0",
163
  "video_token_id": 258884,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  "vision_soft_tokens_per_image": 280
165
- }
 
161
  "tie_word_embeddings": true,
162
  "transformers_version": "5.5.0.dev0",
163
  "video_token_id": 258884,
164
+ "vision_config": {
165
+ "_name_or_path": "",
166
+ "architectures": null,
167
+ "attention_bias": false,
168
+ "attention_dropout": 0.0,
169
+ "chunk_size_feed_forward": 0,
170
+ "default_output_length": 280,
171
+ "dtype": "bfloat16",
172
+ "global_head_dim": 64,
173
+ "head_dim": 64,
174
+ "hidden_activation": "gelu_pytorch_tanh",
175
+ "hidden_size": 768,
176
+ "id2label": {
177
+ "0": "LABEL_0",
178
+ "1": "LABEL_1"
179
+ },
180
+ "initializer_range": 0.02,
181
+ "intermediate_size": 3072,
182
+ "is_encoder_decoder": false,
183
+ "label2id": {
184
+ "LABEL_0": 0,
185
+ "LABEL_1": 1
186
+ },
187
+ "max_position_embeddings": 131072,
188
+ "model_type": "gemma4_vision",
189
+ "num_attention_heads": 12,
190
+ "num_hidden_layers": 16,
191
+ "num_key_value_heads": 12,
192
+ "output_attentions": false,
193
+ "output_hidden_states": false,
194
+ "patch_size": 16,
195
+ "pooling_kernel_size": 3,
196
+ "position_embedding_size": 10240,
197
+ "problem_type": null,
198
+ "return_dict": true,
199
+ "rms_norm_eps": 1e-06,
200
+ "rope_parameters": {
201
+ "rope_theta": 100.0,
202
+ "rope_type": "default"
203
+ },
204
+ "standardize": false,
205
+ "use_clipped_linears": true
206
+ },
207
  "vision_soft_tokens_per_image": 280
208
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33c0732399ce6381b18895eae1440b648a5e9d73f542a944e3af943a95f7a750
3
- size 4269485663
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e7b458e101cafb6d157d395535d1dd0832b4382eda753cf1b0e509ad1bafab0
3
+ size 4229918563
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 4269294164,
4
  "total_parameters": 7518068992
5
  },
6
  "weight_map": {
@@ -1481,6 +1481,8 @@
1481
  "language_model.model.layers.9.self_attn.v_proj.scales": "model.safetensors",
1482
  "language_model.model.layers.9.self_attn.v_proj.weight": "model.safetensors",
1483
  "language_model.model.norm.weight": "model.safetensors",
 
 
1484
  "language_model.model.per_layer_model_projection.weight": "model.safetensors",
1485
  "language_model.model.per_layer_projection_norm.weight": "model.safetensors"
1486
  }
 
1
  {
2
  "metadata": {
3
+ "total_size": 4229726804,
4
  "total_parameters": 7518068992
5
  },
6
  "weight_map": {
 
1481
  "language_model.model.layers.9.self_attn.v_proj.scales": "model.safetensors",
1482
  "language_model.model.layers.9.self_attn.v_proj.weight": "model.safetensors",
1483
  "language_model.model.norm.weight": "model.safetensors",
1484
+ "language_model.model.per_layer_model_projection.biases": "model.safetensors",
1485
+ "language_model.model.per_layer_model_projection.scales": "model.safetensors",
1486
  "language_model.model.per_layer_model_projection.weight": "model.safetensors",
1487
  "language_model.model.per_layer_projection_norm.weight": "model.safetensors"
1488
  }
processor_config.json CHANGED
@@ -1,27 +1,5 @@
1
  {
2
- "audio_ms_per_token": 40,
3
  "audio_seq_length": 750,
4
- "feature_extractor": {
5
- "dither": 0.0,
6
- "feature_extractor_type": "Gemma4AudioFeatureExtractor",
7
- "feature_size": 128,
8
- "fft_length": 512,
9
- "fft_overdrive": false,
10
- "frame_length": 320,
11
- "hop_length": 160,
12
- "input_scale_factor": 1.0,
13
- "max_frequency": 8000.0,
14
- "mel_floor": 0.001,
15
- "min_frequency": 0.0,
16
- "padding_side": "right",
17
- "padding_value": 0.0,
18
- "per_bin_mean": null,
19
- "per_bin_stddev": null,
20
- "preemphasis": 0.0,
21
- "preemphasis_htk_flavor": true,
22
- "return_attention_mask": true,
23
- "sampling_rate": 16000
24
- },
25
  "image_processor": {
26
  "do_convert_rgb": true,
27
  "do_normalize": false,
@@ -43,33 +21,22 @@
43
  "patch_size": 16,
44
  "pooling_kernel_size": 3,
45
  "resample": 3,
46
- "rescale_factor": 0.00392156862745098
 
 
 
 
47
  },
48
  "image_seq_length": 280,
49
  "processor_class": "Gemma4Processor",
50
- "video_processor": {
51
- "do_convert_rgb": true,
52
- "do_normalize": true,
53
- "do_rescale": true,
54
- "do_resize": true,
55
- "do_sample_frames": true,
56
- "image_mean": [
57
- 0.0,
58
- 0.0,
59
- 0.0
60
- ],
61
- "image_std": [
62
- 1.0,
63
- 1.0,
64
- 1.0
65
- ],
66
- "max_soft_tokens": 70,
67
- "num_frames": 32,
68
- "patch_size": 16,
69
- "pooling_kernel_size": 3,
70
- "resample": 3,
71
- "rescale_factor": 0.00392156862745098,
72
- "return_metadata": false,
73
- "video_processor_type": "Gemma4VideoProcessor"
74
- }
75
- }
 
1
  {
 
2
  "audio_seq_length": 750,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "image_processor": {
4
  "do_convert_rgb": true,
5
  "do_normalize": false,
 
21
  "patch_size": 16,
22
  "pooling_kernel_size": 3,
23
  "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 224,
27
+ "width": 224
28
+ }
29
  },
30
  "image_seq_length": 280,
31
  "processor_class": "Gemma4Processor",
32
+ "feature_extractor": {
33
+ "feature_extractor_type": "Gemma4AudioFeatureExtractor",
34
+ "sampling_rate": 16000,
35
+ "num_mel_filters": 128,
36
+ "fft_length": 512,
37
+ "hop_length": 160,
38
+ "chunk_duration": 8.0,
39
+ "overlap_duration": 1.0
40
+ },
41
+ "audio_ms_per_token": 40
42
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenizer_config.json CHANGED
@@ -17,50 +17,71 @@
17
  "<|video|>"
18
  ],
19
  "image_token": "<|image|>",
 
20
  "mask_token": "<mask>",
21
  "model_max_length": 1000000000000000019884624838656,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  "pad_token": "<pad>",
23
  "padding_side": "left",
24
  "processor_class": "Gemma4Processor",
25
  "response_schema": {
26
- "type": "object",
27
  "properties": {
 
 
 
28
  "role": {
29
  "const": "assistant"
30
  },
31
  "thinking": {
32
  "type": "string"
33
  },
34
- "content": {
35
- "type": "string"
36
- },
37
  "tool_calls": {
38
- "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>",
39
- "type": "array",
40
  "items": {
41
- "type": "object",
42
  "properties": {
43
- "type": {
44
- "const": "function"
45
- },
46
  "function": {
47
- "type": "object",
48
- "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})",
49
  "properties": {
50
- "name": {
51
- "type": "string"
52
- },
53
  "arguments": {
 
54
  "type": "object",
55
- "x-parser": "gemma4-tool-call",
56
- "additionalProperties": {}
 
 
57
  }
58
- }
 
 
 
 
 
59
  }
60
- }
61
- }
 
 
 
62
  }
63
  },
 
64
  "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<content>(?:(?!\\<\\|tool_call\\>)(?!\\<turn\\|\\>).)+)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?:\\<turn\\|\\>)?"
65
  },
66
  "soc_token": "<|channel>",
 
17
  "<|video|>"
18
  ],
19
  "image_token": "<|image|>",
20
+ "is_local": true,
21
  "mask_token": "<mask>",
22
  "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
  "pad_token": "<pad>",
44
  "padding_side": "left",
45
  "processor_class": "Gemma4Processor",
46
  "response_schema": {
 
47
  "properties": {
48
+ "content": {
49
+ "type": "string"
50
+ },
51
  "role": {
52
  "const": "assistant"
53
  },
54
  "thinking": {
55
  "type": "string"
56
  },
 
 
 
57
  "tool_calls": {
 
 
58
  "items": {
 
59
  "properties": {
 
 
 
60
  "function": {
 
 
61
  "properties": {
 
 
 
62
  "arguments": {
63
+ "additionalProperties": {},
64
  "type": "object",
65
+ "x-parser": "gemma4-tool-call"
66
+ },
67
+ "name": {
68
+ "type": "string"
69
  }
70
+ },
71
+ "type": "object",
72
+ "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
73
+ },
74
+ "type": {
75
+ "const": "function"
76
  }
77
+ },
78
+ "type": "object"
79
+ },
80
+ "type": "array",
81
+ "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
82
  }
83
  },
84
+ "type": "object",
85
  "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<content>(?:(?!\\<\\|tool_call\\>)(?!\\<turn\\|\\>).)+)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?:\\<turn\\|\\>)?"
86
  },
87
  "soc_token": "<|channel>",