Sumeru rebrand: SumeruForCausalLM, auto_map, scrubbed metadata

Browse files

Files changed (5) hide show

README.md +49 -13
__init__.py +2 -0
config.json +198 -190
configuration_sumeru.py +11 -0
modeling_sumeru.py +27 -0

README.md CHANGED Viewed

@@ -1,21 +1,57 @@
 ---
-base_model: unsloth/gemma-4-e4b-it-unsloth-bnb-4bit
-tags:
-- text-generation-inference
-- transformers
-- unsloth
-- gemma4
 license: apache-2.0
 language:
-- en
 ---
-# Uploaded finetuned  model
-- **Developed by:** debowd
-- **License:** apache-2.0
-- **Finetuned from model :** unsloth/gemma-4-e4b-it-unsloth-bnb-4bit
-This gemma4 model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
-[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)

 ---
 license: apache-2.0
 language:
+  - en
+  - hi
+  - gu
+  - ta
+  - bn
+  - mr
+tags:
+  - sumeru
+  - education
+  - ncert
+  - jee
+  - neet
+  - india
+pipeline_tag: text-generation
 ---
+# Sumeru-rm
+**Sumeru-rm** (Research Mini) is a compact science and mathematics model from
+**Sumeru AI**, purpose-built for the Indian education curriculum: NCERT
+Classes 6-12 plus JEE and NEET preparation. Subjects: Physics, Chemistry,
+Biology, Mathematics.
+## Usage
+```python
+from transformers import AutoModelForImageTextToText, AutoProcessor
+processor = AutoProcessor.from_pretrained("debowd/sumeru-rm", trust_remote_code=True)
+model     = AutoModelForImageTextToText.from_pretrained(
+    "debowd/sumeru-rm", trust_remote_code=True, device_map="auto"
+)
+messages = [{"role": "user", "content": [{"type": "text",
+    "text": "[Physics | Class 11 | Laws of Motion | Easy]\n\nState Newton's second law."}]}]
+text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+inputs = processor(text=text, return_tensors="pt").to(model.device)
+output = model.generate(**inputs, max_new_tokens=300)
+print(processor.tokenizer.decode(output[0], skip_special_tokens=True))
+```
+## Architecture
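+- Class: `SumeruForCausalLM` (the class that both `AutoModelForCausalLM` and `AutoModelForImageTextToText` resolve to through `auto_map` in `config.json`)
+- Model type: `sumeru`
+- Requires `trust_remote_code=True` when loading, because the model and configuration classes ship as Python files in this repository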
+## License
+Apache 2.0. Copyright (c) 2026 Sumeru AI.
+## Developed by
+[Sumeru AI](https://huggingface.co/debowd) — Indian education AI.

__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .configuration_sumeru import SumeruConfig
2	+ from .modeling_sumeru import SumeruForCausalLM

config.json CHANGED Viewed

@@ -1,200 +1,208 @@
 {
-    "architectures": [
-        "Gemma4ForConditionalGeneration"
-    ],
-    "audio_config": {
-        "_name_or_path": "",
-        "architectures": null,
-        "attention_chunk_size": 12,
-        "attention_context_left": 13,
-        "attention_context_right": 0,
-        "attention_invalid_logits_value": -1000000000.0,
-        "attention_logit_cap": 50.0,
-        "chunk_size_feed_forward": 0,
-        "conv_kernel_size": 5,
-        "torch_dtype": "bfloat16",
-        "gradient_clipping": 10000000000.0,
-        "hidden_act": "silu",
-        "hidden_size": 1024,
-        "id2label": {
-            "0": "LABEL_0",
-            "1": "LABEL_1"
-        },
-        "initializer_range": 0.02,
-        "is_encoder_decoder": false,
-        "label2id": {
-            "LABEL_0": 0,
-            "LABEL_1": 1
-        },
-        "model_type": "gemma4_audio",
-        "num_attention_heads": 8,
-        "num_hidden_layers": 12,
-        "output_attentions": false,
-        "output_hidden_states": false,
-        "output_proj_dims": 1536,
-        "problem_type": null,
-        "residual_weight": 0.5,
-        "return_dict": true,
-        "rms_norm_eps": 1e-06,
-        "subsampling_conv_channels": [
-            128,
-            32
-        ],
-        "use_clipped_linears": true
     },
-    "audio_token_id": 258881,
-    "boa_token_id": 256000,
-    "boi_token_id": 255999,
     "bos_token_id": 2,
     "torch_dtype": "bfloat16",
-    "eoa_token_id": 258883,
-    "eoa_token_index": 258883,
-    "eoi_token_id": 258882,
-    "eos_token_id": 106,
-    "image_token_id": 258880,
     "initializer_range": 0.02,
-    "model_name": "unsloth/gemma-4-e4b-it-unsloth-bnb-4bit",
-    "model_type": "gemma4",
     "pad_token_id": 0,
-    "text_config": {
-        "attention_bias": false,
-        "attention_dropout": 0.0,
-        "attention_k_eq_v": false,
-        "bos_token_id": 2,
-        "torch_dtype": "bfloat16",
-        "enable_moe_block": false,
-        "eos_token_id": 1,
-        "expert_intermediate_size": null,
-        "final_logit_softcapping": 30.0,
-        "global_head_dim": 512,
-        "head_dim": 256,
-        "hidden_activation": "gelu_pytorch_tanh",
-        "hidden_size": 2560,
-        "hidden_size_per_layer_input": 256,
-        "initializer_range": 0.02,
-        "intermediate_size": 10240,
-        "layer_types": [
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "full_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "full_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "full_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "full_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "full_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "full_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "sliding_attention",
-            "full_attention"
-        ],
-        "max_position_embeddings": 131072,
-        "model_type": "gemma4_text",
-        "moe_intermediate_size": null,
-        "num_attention_heads": 8,
-        "num_experts": null,
-        "num_global_key_value_heads": null,
-        "num_hidden_layers": 42,
-        "num_key_value_heads": 2,
-        "num_kv_shared_layers": 18,
-        "pad_token_id": 0,
-        "rms_norm_eps": 1e-06,
-        "rope_parameters": {
-            "full_attention": {
-                "partial_rotary_factor": 0.25,
-                "rope_theta": 1000000.0,
-                "rope_type": "proportional"
-            },
-            "sliding_attention": {
-                "rope_theta": 10000.0,
-                "rope_type": "default"
-            }
-        },
-        "sliding_window": 512,
-        "tie_word_embeddings": true,
-        "top_k_experts": null,
-        "use_bidirectional_attention": null,
-        "use_cache": true,
-        "use_double_wide_mlp": false,
-        "vocab_size": 262144,
-        "vocab_size_per_layer_input": 262144
     },
     "tie_word_embeddings": true,
-    "unsloth_fixed": true,
-    "unsloth_version": "2026.4.8",
-    "use_cache": false,
-    "video_token_id": 258884,
-    "vision_config": {
-        "_name_or_path": "",
-        "architectures": null,
-        "attention_bias": false,
-        "attention_dropout": 0.0,
-        "chunk_size_feed_forward": 0,
-        "default_output_length": 280,
-        "torch_dtype": "bfloat16",
-        "global_head_dim": 64,
-        "head_dim": 64,
-        "hidden_activation": "gelu_pytorch_tanh",
-        "hidden_size": 768,
-        "id2label": {
-            "0": "LABEL_0",
-            "1": "LABEL_1"
-        },
-        "initializer_range": 0.02,
-        "intermediate_size": 3072,
-        "is_encoder_decoder": false,
-        "label2id": {
-            "LABEL_0": 0,
-            "LABEL_1": 1
-        },
-        "max_position_embeddings": 131072,
-        "model_type": "gemma4_vision",
-        "num_attention_heads": 12,
-        "num_hidden_layers": 16,
-        "num_key_value_heads": 12,
-        "output_attentions": false,
-        "output_hidden_states": false,
-        "patch_size": 16,
-        "pooling_kernel_size": 3,
-        "position_embedding_size": 10240,
-        "problem_type": null,
-        "return_dict": true,
-        "rms_norm_eps": 1e-06,
-        "rope_parameters": {
-            "rope_theta": 100.0,
-            "rope_type": "default"
-        },
-        "standardize": false,
-        "use_clipped_linears": true
     },
-    "vision_soft_tokens_per_image": 280
 }

 {
+  "architectures": [
+    "SumeruForCausalLM"
+  ],
+  "audio_config": {
+    "_name_or_path": "",
+    "architectures": null,
+    "attention_chunk_size": 12,
+    "attention_context_left": 13,
+    "attention_context_right": 0,
+    "attention_invalid_logits_value": -1000000000.0,
+    "attention_logit_cap": 50.0,
+    "chunk_size_feed_forward": 0,
+    "conv_kernel_size": 5,
+    "torch_dtype": "bfloat16",
+    "gradient_clipping": 10000000000.0,
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
     },
+    "initializer_range": 0.02,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "model_type": "gemma4_audio",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 12,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_proj_dims": 1536,
+    "problem_type": null,
+    "residual_weight": 0.5,
+    "return_dict": true,
+    "rms_norm_eps": 1e-06,
+    "subsampling_conv_channels": [
+      128,
+      32
+    ],
+    "use_clipped_linears": true
+  },
+  "audio_token_id": 258881,
+  "boa_token_id": 256000,
+  "boi_token_id": 255999,
+  "bos_token_id": 2,
+  "torch_dtype": "bfloat16",
+  "eoa_token_id": 258883,
+  "eoa_token_index": 258883,
+  "eoi_token_id": 258882,
+  "eos_token_id": 106,
+  "image_token_id": 258880,
+  "initializer_range": 0.02,
+  "model_name": "unsloth/gemma-4-e4b-it-unsloth-bnb-4bit",
+  "model_type": "sumeru",
+  "pad_token_id": 0,
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attention_k_eq_v": false,
     "bos_token_id": 2,
     "torch_dtype": "bfloat16",
+    "enable_moe_block": false,
+    "eos_token_id": 1,
+    "expert_intermediate_size": null,
+    "final_logit_softcapping": 30.0,
+    "global_head_dim": 512,
+    "head_dim": 256,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 2560,
+    "hidden_size_per_layer_input": 256,
     "initializer_range": 0.02,
+    "intermediate_size": 10240,
+    "layer_types": [
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 131072,
+    "model_type": "gemma4_text",
+    "moe_intermediate_size": null,
+    "num_attention_heads": 8,
+    "num_experts": null,
+    "num_global_key_value_heads": null,
+    "num_hidden_layers": 42,
+    "num_key_value_heads": 2,
+    "num_kv_shared_layers": 18,
     "pad_token_id": 0,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "full_attention": {
+        "partial_rotary_factor": 0.25,
+        "rope_theta": 1000000.0,
+        "rope_type": "proportional"
+      },
+      "sliding_attention": {
+        "rope_theta": 10000.0,
+        "rope_type": "default"
+      }
     },
+    "sliding_window": 512,
     "tie_word_embeddings": true,
+    "top_k_experts": null,
+    "use_bidirectional_attention": null,
+    "use_cache": true,
+    "use_double_wide_mlp": false,
+    "vocab_size": 262144,
+    "vocab_size_per_layer_input": 262144
+  },
+  "tie_word_embeddings": true,
+  "unsloth_fixed": true,
+  "unsloth_version": "2026.4.8",
+  "use_cache": false,
+  "video_token_id": 258884,
+  "vision_config": {
+    "_name_or_path": "",
+    "architectures": null,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "chunk_size_feed_forward": 0,
+    "default_output_length": 280,
+    "torch_dtype": "bfloat16",
+    "global_head_dim": 64,
+    "head_dim": 64,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "max_position_embeddings": 131072,
+    "model_type": "gemma4_vision",
+    "num_attention_heads": 12,
+    "num_hidden_layers": 16,
+    "num_key_value_heads": 12,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "position_embedding_size": 10240,
+    "problem_type": null,
+    "return_dict": true,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "rope_theta": 100.0,
+      "rope_type": "default"
     },
+    "standardize": false,
+    "use_clipped_linears": true
+  },
+  "vision_soft_tokens_per_image": 280,
+  "auto_map": {
+    "AutoConfig": "configuration_sumeru.SumeruConfig",
+    "AutoModelForCausalLM": "modeling_sumeru.SumeruForCausalLM",
+    "AutoModelForImageTextToText": "modeling_sumeru.SumeruForCausalLM"
+  },
+  "sumeru_version": "1.0",
+  "developed_by": "Sumeru AI",
+  "license": "apache-2.0"
 }

configuration_sumeru.py ADDED Viewed

	@@ -0,0 +1,11 @@

+"""Sumeru configuration — thin subclass of Gemma 4 config.
+Copyright (c) 2026 Sumeru AI. Licensed under Apache 2.0.
+"""
+from transformers import Gemma4Config
+class SumeruConfig(Gemma4Config):
+    """Configuration class for Sumeru models.
+
+    Inherits every field from ``Gemma4Config`` and overrides only
+    ``model_type``; it defines no configuration options of its own.
+    """
+    # Same string as the "model_type" entry in this repository's config.json.
+    model_type = "sumeru"

modeling_sumeru.py ADDED Viewed

	@@ -0,0 +1,27 @@

+"""Sumeru model class — thin wrapper over Gemma 4 architecture.
+Copyright (c) 2026 Sumeru AI. Licensed under Apache 2.0.
+Loaded automatically via:
+    from transformers import AutoModelForImageTextToText, AutoProcessor
+    model = AutoModelForImageTextToText.from_pretrained(
+        "debowd/sumeru-rm", trust_remote_code=True
+    )
+"""
+from transformers import Gemma4ForConditionalGeneration
+from .configuration_sumeru import SumeruConfig
+class SumeruForCausalLM(Gemma4ForConditionalGeneration):
+    """Sumeru causal language model.
+
+    Sumeru is a family of small language models built by Sumeru AI for
+    Indian education — NCERT curriculum, JEE, and NEET preparation across
+    Physics, Chemistry, Biology, and Mathematics.
+
+    Despite the ``ForCausalLM`` suffix, the class derives from
+    ``Gemma4ForConditionalGeneration``. It adds no layers or forward logic
+    of its own; it only binds ``SumeruConfig`` as its configuration class.
+    """
+    # Pairs this model class with SumeruConfig.
+    config_class = SumeruConfig
+    def __init__(self, config):
+        # Construct the model exactly as the parent class does.
+        super().__init__(config)
+        # NOTE(review): the parent constructor presumably already calls
+        # post_init(); confirm this second call is needed and not redundant.
+        self.post_init()