amirali1985 committed on
Commit
6904bbc
·
verified ·
1 Parent(s): 77daef9

Upload add_sub_baseline_50K

Browse files
add_sub_baseline_50K/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention"
19
+ ],
20
+ "max_position_embeddings": 128,
21
+ "max_window_layers": 28,
22
+ "model_type": "qwen3",
23
+ "num_attention_heads": 4,
24
+ "num_hidden_layers": 3,
25
+ "num_key_value_heads": 4,
26
+ "pad_token_id": null,
27
+ "rms_norm_eps": 1e-06,
28
+ "rope_parameters": {
29
+ "rope_theta": 10000.0,
30
+ "rope_type": "default"
31
+ },
32
+ "sliding_window": null,
33
+ "tie_word_embeddings": false,
34
+ "transformers_version": "5.5.0",
35
+ "use_cache": true,
36
+ "use_sliding_window": false,
37
+ "vocab_size": 151645
38
+ }
add_sub_baseline_50K/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_baseline_50K/metrics.json ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 200,
8
+ 250,
9
+ 300,
10
+ 350,
11
+ 400,
12
+ 450,
13
+ 500,
14
+ 550,
15
+ 600,
16
+ 650,
17
+ 700,
18
+ 750,
19
+ 800,
20
+ 850,
21
+ 900,
22
+ 950,
23
+ 1000,
24
+ 1050,
25
+ 1100,
26
+ 1150,
27
+ 1200,
28
+ 1250,
29
+ 1300,
30
+ 1350,
31
+ 1400,
32
+ 1450,
33
+ 1500,
34
+ 1550,
35
+ 1600,
36
+ 1650,
37
+ 1700,
38
+ 1750,
39
+ 1800,
40
+ 1850,
41
+ 1900,
42
+ 1950,
43
+ 2000,
44
+ 2050,
45
+ 2100,
46
+ 2150,
47
+ 2200,
48
+ 2250,
49
+ 2300
50
+ ],
51
+ "loss": [
52
+ 9.407320022583008,
53
+ 7.224453926086426,
54
+ 6.355410099029541,
55
+ 5.141878604888916,
56
+ 3.429792642593384,
57
+ 2.3642735481262207,
58
+ 1.9619863033294678,
59
+ 1.8256255388259888,
60
+ 1.7376317977905273,
61
+ 1.3590067625045776,
62
+ 0.5938534736633301,
63
+ 0.4759748876094818,
64
+ 0.3052070438861847,
65
+ 0.1909075677394867,
66
+ 0.17617182433605194,
67
+ 0.1818520724773407,
68
+ 0.11216778308153152,
69
+ 0.12447719275951385,
70
+ 0.07145003229379654,
71
+ 0.08844210207462311,
72
+ 0.06295949220657349,
73
+ 0.03105434775352478,
74
+ 0.038688015192747116,
75
+ 0.04730251058936119,
76
+ 0.04191047325730324,
77
+ 0.05501880869269371,
78
+ 0.0545913390815258,
79
+ 0.03572807461023331,
80
+ 0.022829385474324226,
81
+ 0.0187111496925354,
82
+ 0.025716397911310196,
83
+ 0.0280207097530365,
84
+ 0.018566560000181198,
85
+ 0.01017980556935072,
86
+ 0.021484609693288803,
87
+ 0.006599620915949345,
88
+ 0.005485634785145521,
89
+ 0.012738294899463654,
90
+ 0.004766442347317934,
91
+ 0.006911102682352066,
92
+ 0.007414546329528093,
93
+ 0.011577660217881203,
94
+ 0.005165865179151297,
95
+ 0.01003621332347393,
96
+ 0.006265468895435333,
97
+ 0.00291456445120275
98
+ ],
99
+ "base_loss": [
100
+ 9.407320022583008,
101
+ 7.224453926086426,
102
+ 6.355410099029541,
103
+ 5.141878604888916,
104
+ 3.429792642593384,
105
+ 2.3642735481262207,
106
+ 1.9619863033294678,
107
+ 1.8256255388259888,
108
+ 1.7376317977905273,
109
+ 1.3590067625045776,
110
+ 0.5938534736633301,
111
+ 0.4759748876094818,
112
+ 0.3052070438861847,
113
+ 0.1909075677394867,
114
+ 0.17617182433605194,
115
+ 0.1818520724773407,
116
+ 0.11216778308153152,
117
+ 0.12447719275951385,
118
+ 0.07145003229379654,
119
+ 0.08844210207462311,
120
+ 0.06295949220657349,
121
+ 0.03105434775352478,
122
+ 0.038688015192747116,
123
+ 0.04730251058936119,
124
+ 0.04191047325730324,
125
+ 0.05501880869269371,
126
+ 0.0545913390815258,
127
+ 0.03572807461023331,
128
+ 0.022829385474324226,
129
+ 0.0187111496925354,
130
+ 0.025716397911310196,
131
+ 0.0280207097530365,
132
+ 0.018566560000181198,
133
+ 0.01017980556935072,
134
+ 0.021484609693288803,
135
+ 0.006599620915949345,
136
+ 0.005485634785145521,
137
+ 0.012738294899463654,
138
+ 0.004766442347317934,
139
+ 0.006911102682352066,
140
+ 0.007414546329528093,
141
+ 0.011577660217881203,
142
+ 0.005165865179151297,
143
+ 0.01003621332347393,
144
+ 0.006265468895435333,
145
+ 0.00291456445120275
146
+ ],
147
+ "lr": [
148
+ 8.358208955223882e-06,
149
+ 1.6886993603411513e-05,
150
+ 2.541577825159915e-05,
151
+ 3.394456289978678e-05,
152
+ 4.247334754797441e-05,
153
+ 5.100213219616205e-05,
154
+ 5.953091684434969e-05,
155
+ 6.805970149253732e-05,
156
+ 7.658848614072494e-05,
157
+ 7.994958584913144e-05,
158
+ 7.96419594562729e-05,
159
+ 7.905686510146095e-05,
160
+ 7.819839806506031e-05,
161
+ 7.707256705860444e-05,
162
+ 7.568725216771852e-05,
163
+ 7.405214969663954e-05,
164
+ 7.217870430038601e-05,
165
+ 7.008002887960812e-05,
166
+ 6.777081279880229e-05,
167
+ 6.526721907030291e-05,
168
+ 6.258677122369667e-05,
169
+ 5.9748230652499966e-05,
170
+ 5.677146529659303e-05,
171
+ 5.3677310579548395e-05,
172
+ 5.04874235742021e-05,
173
+ 4.722413141721374e-05,
174
+ 4.391027503361521e-05,
175
+ 4.056904926517458e-05,
176
+ 3.722384052157265e-05,
177
+ 3.3898063090728774e-05,
178
+ 3.061499525399694e-05,
179
+ 2.7397616353319584e-05,
180
+ 2.4268445950762915e-05,
181
+ 2.1249386206212863e-05,
182
+ 1.8361568576485003e-05,
183
+ 1.5625205908855437e-05,
184
+ 1.3059450964261466e-05,
185
+ 1.0682262360418084e-05,
186
+ 8.510278873161129e-06,
187
+ 6.558702975826241e-06,
188
+ 4.841194431812048e-06,
189
+ 3.3697746851107672e-06,
190
+ 2.1547427180102297e-06,
191
+ 1.2046029649088297e-06,
192
+ 5.260057867998169e-07,
193
+ 1.2370092306488446e-07
194
+ ]
195
+ },
196
+ "final_accuracy": 0.98
197
+ }
add_sub_baseline_50K/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5490f49379b81c1cae6704d8090541cdb0c1699eeb937ba75a009f520c0e01f8
3
+ size 671794850
add_sub_baseline_50K/train_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mode": "baseline",
3
+ "ops": "add_sub",
4
+ "n_digits": 6,
5
+ "n_layer": 3,
6
+ "n_head": 4,
7
+ "n_embd": 512,
8
+ "abs_vocab": 0,
9
+ "K": 4,
10
+ "batch_size": 64,
11
+ "num_epochs": 3,
12
+ "dataset_size": 50000,
13
+ "lr": 8e-05,
14
+ "output_dir": "ckpt/r/add_sub_baseline_50K",
15
+ "device": "cuda",
16
+ "push_to_hub": true,
17
+ "no_wandb": false,
18
+ "n_params": 167871744,
19
+ "run_name": "add_sub_baseline_50K",
20
+ "git_commit": "9e4530548a98f8c7f5c14930ac4aec4886bb4b1b",
21
+ "timestamp": "2026-04-07T05:27:33.004161",
22
+ "tokenizer": "Qwen/Qwen3-0.6B",
23
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
24
+ "dataset_config": "add_sub_6digit",
25
+ "model_repo": "thoughtworks/arithmetic-sorl",
26
+ "trainer_version": "sft",
27
+ "final_accuracy": 0.98
28
+ }