Training in progress, step 1000
- config.json +63 -15
- generation_config.json +4 -4
- model.safetensors +1 -1
- training_args.bin +1 -1
config.json
CHANGED
@@ -279,20 +279,19 @@
   "system_prompt": "",
   "temperature": 0.7,
   "text_config": {
-    "_name_or_path": "
+    "_name_or_path": "HuggingFaceTB/SmolLM3-3B",
     "architectures": [
-      "
+      "SmolLM3ForCausalLM"
     ],
     "attention_bias": false,
     "attention_dropout": 0.0,
     "bos_token_id": null,
     "dtype": "bfloat16",
-    "eos_token_id":
-    "head_dim": 128,
+    "eos_token_id": 128012,
     "hidden_act": "silu",
     "hidden_size": 2048,
     "initializer_range": 0.02,
-    "intermediate_size":
+    "intermediate_size": 11008,
     "layer_types": [
       "full_attention",
       "full_attention",
@@ -321,27 +320,76 @@
       "full_attention",
       "full_attention",
       "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
       "full_attention"
     ],
-    "max_position_embeddings":
+    "max_position_embeddings": 65536,
     "max_window_layers": 28,
-    "
+    "mlp_bias": false,
+    "model_type": "smollm3",
+    "no_rope_layer_interval": 4,
+    "no_rope_layers": [
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1,
+      0
+    ],
     "num_attention_heads": 16,
-    "num_hidden_layers":
-    "num_key_value_heads":
-    "pad_token_id":
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 4,
+    "pad_token_id": 128004,
+    "pretraining_tp": 2,
     "rms_norm_eps": 1e-06,
     "rope_parameters": {
-      "rope_theta":
+      "rope_theta": 5000000.0,
       "rope_type": "default"
     },
     "sliding_window": null,
     "tie_word_embeddings": true,
-    "use_cache":
+    "use_cache": false,
     "use_sliding_window": false,
-    "vocab_size":
+    "vocab_size": 128257
   },
-  "text_model_id": "
+  "text_model_id": "HuggingFaceTB/SmolLM3-3B",
   "time_mask_length": 100,
   "top_k": null,
   "top_p": null,
@@ -349,5 +397,5 @@
   "use_cache": false,
   "use_lora": false,
   "use_specaugment": true,
-  "vocab_size":
+  "vocab_size": 128257
 }
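For orientation, the filled-in text_config matches the published SmolLM3-3B backbone. Below is a minimal sketch (not part of the commit) that checks the values this diff introduces, assuming a local copy of the updated config.json; the interpretation of no_rope_layers (1 = RoPE applied, 0 = NoPE layer) follows the SmolLM3 convention of skipping RoPE every no_rope_layer_interval-th layer.

import json

# Inspect the nested text_config this commit fills in.
with open("config.json") as f:              # assumed local copy of the file above
    text_cfg = json.load(f)["text_config"]

print(text_cfg["_name_or_path"])            # HuggingFaceTB/SmolLM3-3B
print(text_cfg["num_hidden_layers"])        # 36
print(text_cfg["num_attention_heads"])      # 16 query heads
print(text_cfg["num_key_value_heads"])      # 4 -> grouped-query attention, 4 query heads per KV head

# With no_rope_layer_interval = 4, every fourth layer skips RoPE (the 0 entries):
assert text_cfg["no_rope_layers"] == [1, 1, 1, 0] * 9
assert len(text_cfg["no_rope_layers"]) == text_cfg["num_hidden_layers"]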
generation_config.json
CHANGED
@@ -1,16 +1,16 @@
 {
-  "bos_token_id":
+  "bos_token_id": 128000,
   "do_sample": true,
   "eos_token_id": [
-
-
+    128012,
+    null
   ],
   "length_penalty": 1.0,
   "max_new_tokens": 128,
   "min_new_tokens": 0,
   "no_repeat_ngram_size": 0,
   "num_beams": 1,
-  "pad_token_id":
+  "pad_token_id": 128004,
   "repetition_penalty": 1.05,
   "temperature": 0.7,
   "transformers_version": "5.0.0.dev0",
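As a sketch of what these settings mean at inference time (not part of the commit), the same values can be reproduced with transformers' GenerationConfig. The trailing null in eos_token_id looks like a second stop-token slot that has not been resolved yet, so it is omitted here as an assumption:

from transformers import GenerationConfig

gen = GenerationConfig(
    bos_token_id=128000,
    eos_token_id=[128012],   # the null entry in the file is dropped here
    pad_token_id=128004,
    do_sample=True,          # sampling, not greedy decoding
    temperature=0.7,
    repetition_penalty=1.05,
    max_new_tokens=128,
    num_beams=1,
)
print(gen)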
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f2edc6701b98756dfb86241a8eccc63a7f29eedf9898bf9cf85742e7b2f3be77
 size 58732960
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:dae774817b9d14faaedcd0353adbe6446f55e18bc682604a3a87a18f9e9ec0cb
 size 5265
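Both binary files are git-lfs pointers, so only their sha256 oid changes here (the sizes are unchanged). A minimal sketch, assuming the artifacts have been downloaded locally under these filenames, for verifying a download against the recorded oids:

import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file in chunks so large checkpoints do not need to fit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

# oids recorded in the LFS pointers above
assert sha256_of("training_args.bin") == "dae774817b9d14faaedcd0353adbe6446f55e18bc682604a3a87a18f9e9ec0cb"
assert sha256_of("model.safetensors") == "f2edc6701b98756dfb86241a8eccc63a7f29eedf9898bf9cf85742e7b2f3be77"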