Upload add_baseline_25K

Browse files

Files changed (5) hide show

add_baseline_25K/config.json +38 -0
add_baseline_25K/generation_config.json +7 -0
add_baseline_25K/metrics.json +105 -0
add_baseline_25K/model.safetensors +3 -0
add_baseline_25K/train_config.json +28 -0

add_baseline_25K/config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "architectures": [
+    "SorlModelWrapper"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": null,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 128,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 3,
+  "num_key_value_heads": 4,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 10000.0,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "transformers_version": "5.5.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151645
+}

add_baseline_25K/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "transformers_version": "5.5.0",
+  "use_cache": true
+}

add_baseline_25K/metrics.json ADDED Viewed

	@@ -0,0 +1,105 @@

+{
+  "history": {
+    "step": [
+      50,
+      100,
+      150,
+      200,
+      250,
+      300,
+      350,
+      400,
+      450,
+      500,
+      550,
+      600,
+      650,
+      700,
+      750,
+      800,
+      850,
+      900,
+      950,
+      1000,
+      1050,
+      1100,
+      1150
+    ],
+    "loss": [
+      8.406509399414062,
+      6.529412269592285,
+      4.597049713134766,
+      2.5594711303710938,
+      1.7348458766937256,
+      1.4393640756607056,
+      0.8803462982177734,
+      0.4436863958835602,
+      0.24189461767673492,
+      0.17245309054851532,
+      0.13498370349407196,
+      0.11610734462738037,
+      0.044174227863550186,
+      0.06227850541472435,
+      0.03381551057100296,
+      0.02790527604520321,
+      0.021734651178121567,
+      0.019603630527853966,
+      0.007907840423285961,
+      0.015318186022341251,
+      0.012655793689191341,
+      0.009013988077640533,
+      0.005673403386026621
+    ],
+    "base_loss": [
+      8.406509399414062,
+      6.529412269592285,
+      4.597049713134766,
+      2.5594711303710938,
+      1.7348458766937256,
+      1.4393640756607056,
+      0.8803462982177734,
+      0.4436863958835602,
+      0.24189461767673492,
+      0.17245309054851532,
+      0.13498370349407196,
+      0.11610734462738037,
+      0.044174227863550186,
+      0.06227850541472435,
+      0.03381551057100296,
+      0.02790527604520321,
+      0.021734651178121567,
+      0.019603630527853966,
+      0.007907840423285961,
+      0.015318186022341251,
+      0.012655793689191341,
+      0.009013988077640533,
+      0.005673403386026621
+    ],
+    "lr": [
+      1.6752136752136756e-05,
+      3.384615384615385e-05,
+      5.094017094017095e-05,
+      6.803418803418804e-05,
+      7.994963951276301e-05,
+      7.905786527705838e-05,
+      7.707564529070769e-05,
+      7.405832060590692e-05,
+      7.009013107697279e-05,
+      6.528186349112191e-05,
+      5.976775854276414e-05,
+      5.370176300464045e-05,
+      4.725323173040355e-05,
+      4.060219948324443e-05,
+      3.3934354595074675e-05,
+      2.7435854785285614e-05,
+      2.1288129874808147e-05,
+      1.566281649706339e-05,
+      1.0716966222210186e-05,
+      6.5886608777444526e-06,
+      3.3931574801034573e-06,
+      1.219670405864477e-06,
+      1.2888064021131298e-07
+    ]
+  },
+  "final_accuracy": 0.995
+}

add_baseline_25K/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04f707c002ca332f02ba34764baa3b022c00d104a3e9baa0e8c0b106d6a19233
+size 671794850

add_baseline_25K/train_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "mode": "baseline",
+  "ops": "add",
+  "n_digits": 6,
+  "n_layer": 3,
+  "n_head": 4,
+  "n_embd": 512,
+  "abs_vocab": 0,
+  "K": 4,
+  "batch_size": 64,
+  "num_epochs": 3,
+  "dataset_size": 25000,
+  "lr": 8e-05,
+  "output_dir": "ckpt/r/add_baseline_25K",
+  "device": "cuda",
+  "push_to_hub": true,
+  "no_wandb": false,
+  "n_params": 167871744,
+  "run_name": "add_baseline_25K",
+  "git_commit": "9e4530548a98f8c7f5c14930ac4aec4886bb4b1b",
+  "timestamp": "2026-04-07T05:23:18.157225",
+  "tokenizer": "Qwen/Qwen3-0.6B",
+  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
+  "dataset_config": "add_6digit",
+  "model_repo": "thoughtworks/arithmetic-sorl",
+  "trainer_version": "sft",
+  "final_accuracy": 0.995
+}