amirali1985 commited on
Commit
3aab210
·
verified ·
1 Parent(s): d1932c1

Upload add_baseline_25K

Browse files
add_baseline_25K/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention"
19
+ ],
20
+ "max_position_embeddings": 128,
21
+ "max_window_layers": 28,
22
+ "model_type": "qwen3",
23
+ "num_attention_heads": 4,
24
+ "num_hidden_layers": 3,
25
+ "num_key_value_heads": 4,
26
+ "pad_token_id": null,
27
+ "rms_norm_eps": 1e-06,
28
+ "rope_parameters": {
29
+ "rope_theta": 10000.0,
30
+ "rope_type": "default"
31
+ },
32
+ "sliding_window": null,
33
+ "tie_word_embeddings": false,
34
+ "transformers_version": "5.5.0",
35
+ "use_cache": true,
36
+ "use_sliding_window": false,
37
+ "vocab_size": 151645
38
+ }
add_baseline_25K/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_baseline_25K/metrics.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 200,
8
+ 250,
9
+ 300,
10
+ 350,
11
+ 400,
12
+ 450,
13
+ 500,
14
+ 550,
15
+ 600,
16
+ 650,
17
+ 700,
18
+ 750,
19
+ 800,
20
+ 850,
21
+ 900,
22
+ 950,
23
+ 1000,
24
+ 1050,
25
+ 1100,
26
+ 1150
27
+ ],
28
+ "loss": [
29
+ 8.406509399414062,
30
+ 6.529412269592285,
31
+ 4.597049713134766,
32
+ 2.5594711303710938,
33
+ 1.7348458766937256,
34
+ 1.4393640756607056,
35
+ 0.8803462982177734,
36
+ 0.4436863958835602,
37
+ 0.24189461767673492,
38
+ 0.17245309054851532,
39
+ 0.13498370349407196,
40
+ 0.11610734462738037,
41
+ 0.044174227863550186,
42
+ 0.06227850541472435,
43
+ 0.03381551057100296,
44
+ 0.02790527604520321,
45
+ 0.021734651178121567,
46
+ 0.019603630527853966,
47
+ 0.007907840423285961,
48
+ 0.015318186022341251,
49
+ 0.012655793689191341,
50
+ 0.009013988077640533,
51
+ 0.005673403386026621
52
+ ],
53
+ "base_loss": [
54
+ 8.406509399414062,
55
+ 6.529412269592285,
56
+ 4.597049713134766,
57
+ 2.5594711303710938,
58
+ 1.7348458766937256,
59
+ 1.4393640756607056,
60
+ 0.8803462982177734,
61
+ 0.4436863958835602,
62
+ 0.24189461767673492,
63
+ 0.17245309054851532,
64
+ 0.13498370349407196,
65
+ 0.11610734462738037,
66
+ 0.044174227863550186,
67
+ 0.06227850541472435,
68
+ 0.03381551057100296,
69
+ 0.02790527604520321,
70
+ 0.021734651178121567,
71
+ 0.019603630527853966,
72
+ 0.007907840423285961,
73
+ 0.015318186022341251,
74
+ 0.012655793689191341,
75
+ 0.009013988077640533,
76
+ 0.005673403386026621
77
+ ],
78
+ "lr": [
79
+ 1.6752136752136756e-05,
80
+ 3.384615384615385e-05,
81
+ 5.094017094017095e-05,
82
+ 6.803418803418804e-05,
83
+ 7.994963951276301e-05,
84
+ 7.905786527705838e-05,
85
+ 7.707564529070769e-05,
86
+ 7.405832060590692e-05,
87
+ 7.009013107697279e-05,
88
+ 6.528186349112191e-05,
89
+ 5.976775854276414e-05,
90
+ 5.370176300464045e-05,
91
+ 4.725323173040355e-05,
92
+ 4.060219948324443e-05,
93
+ 3.3934354595074675e-05,
94
+ 2.7435854785285614e-05,
95
+ 2.1288129874808147e-05,
96
+ 1.566281649706339e-05,
97
+ 1.0716966222210186e-05,
98
+ 6.5886608777444526e-06,
99
+ 3.3931574801034573e-06,
100
+ 1.219670405864477e-06,
101
+ 1.2888064021131298e-07
102
+ ]
103
+ },
104
+ "final_accuracy": 0.995
105
+ }
add_baseline_25K/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04f707c002ca332f02ba34764baa3b022c00d104a3e9baa0e8c0b106d6a19233
3
+ size 671794850
add_baseline_25K/train_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mode": "baseline",
3
+ "ops": "add",
4
+ "n_digits": 6,
5
+ "n_layer": 3,
6
+ "n_head": 4,
7
+ "n_embd": 512,
8
+ "abs_vocab": 0,
9
+ "K": 4,
10
+ "batch_size": 64,
11
+ "num_epochs": 3,
12
+ "dataset_size": 25000,
13
+ "lr": 8e-05,
14
+ "output_dir": "ckpt/r/add_baseline_25K",
15
+ "device": "cuda",
16
+ "push_to_hub": true,
17
+ "no_wandb": false,
18
+ "n_params": 167871744,
19
+ "run_name": "add_baseline_25K",
20
+ "git_commit": "9e4530548a98f8c7f5c14930ac4aec4886bb4b1b",
21
+ "timestamp": "2026-04-07T05:23:18.157225",
22
+ "tokenizer": "Qwen/Qwen3-0.6B",
23
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
24
+ "dataset_config": "add_6digit",
25
+ "model_repo": "thoughtworks/arithmetic-sorl",
26
+ "trainer_version": "sft",
27
+ "final_accuracy": 0.995
28
+ }