Upload folder using huggingface_hub
Browse files- teaching_arithmetic/out/addition_plain/addition_plain/config.yaml +80 -0
- teaching_arithmetic/out/addition_plain/addition_plain/result.csv +22 -0
- teaching_arithmetic/out/addition_plain/ckpt_10000.pt +3 -0
- teaching_arithmetic/out/addition_plain/ckpt_10000_acc.pt +3 -0
- teaching_arithmetic/out/addition_plain/ckpt_10000_final.pt +3 -0
teaching_arithmetic/out/addition_plain/addition_plain/config.yaml
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
add_space: false
|
2 |
+
algo_reason: false
|
3 |
+
always_save_checkpoint: false
|
4 |
+
backend: nccl
|
5 |
+
batch_size: 256
|
6 |
+
beta1: 0.9
|
7 |
+
beta2: 0.99
|
8 |
+
bias: false
|
9 |
+
binary: false
|
10 |
+
block_size: 256
|
11 |
+
ckpt_path_name: ckpt_10000.pt
|
12 |
+
compile: true
|
13 |
+
data_format: plain
|
14 |
+
data_ratio: 0.2
|
15 |
+
data_shuffle: true
|
16 |
+
data_type: text
|
17 |
+
dataset: bal
|
18 |
+
decay_lr: true
|
19 |
+
device: cuda:0
|
20 |
+
dropout: 0.2
|
21 |
+
dtype: bfloat16
|
22 |
+
eps: 0
|
23 |
+
eval_addition: true
|
24 |
+
eval_addition_ar: false
|
25 |
+
eval_addition_train: true
|
26 |
+
eval_interval: 250
|
27 |
+
eval_iters: 200
|
28 |
+
eval_only: false
|
29 |
+
eval_other: false
|
30 |
+
eval_text: false
|
31 |
+
eval_text_data_path: null
|
32 |
+
exp_name: addition_plain
|
33 |
+
grad_clip: 1.0
|
34 |
+
gradient_accumulation_steps: 1
|
35 |
+
init_from: scratch
|
36 |
+
learning_rate: 0.001
|
37 |
+
log_interval: 10
|
38 |
+
lr_decay_iters: 5000
|
39 |
+
max_iters: 5000
|
40 |
+
meta_path_specified: true
|
41 |
+
min_lr: null
|
42 |
+
multi_digit: false
|
43 |
+
n_embd: 384
|
44 |
+
n_head: 6
|
45 |
+
n_layer: 6
|
46 |
+
num_digit: 5
|
47 |
+
operator: +
|
48 |
+
other_operator: +
|
49 |
+
out_dir: out/addition_plain
|
50 |
+
print_interval: 2
|
51 |
+
random_A: false
|
52 |
+
random_C: false
|
53 |
+
result_dir: out/addition_plain/addition_plain
|
54 |
+
resume_dir: null
|
55 |
+
resume_iter: false
|
56 |
+
reverse_ab: false
|
57 |
+
reverse_c: false
|
58 |
+
save_final: true
|
59 |
+
simple: false
|
60 |
+
start: FILE:data/bal/test_10000.txt
|
61 |
+
start_ar: null
|
62 |
+
start_other: null
|
63 |
+
start_train: null
|
64 |
+
test_batch_size: 128
|
65 |
+
tokenizer: char
|
66 |
+
train_both: false
|
67 |
+
train_data_path: train_3digit_10000.txt
|
68 |
+
train_data_path2: train_addition.bin
|
69 |
+
use_flash: true
|
70 |
+
use_lora: false
|
71 |
+
val_data_path: val.bin
|
72 |
+
val_data_path2: val_addition.bin
|
73 |
+
vocabulary: all_ascii_chars
|
74 |
+
wandb_entity: ssdd
|
75 |
+
wandb_log: false
|
76 |
+
wandb_project: addition
|
77 |
+
wandb_run_name: addition_plain
|
78 |
+
warmup_iters: 100
|
79 |
+
weight_decay: 0.1
|
80 |
+
zero_pad: false
|
teaching_arithmetic/out/addition_plain/addition_plain/result.csv
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
iter,train_loss,val_loss,val_ppl,test_acc,train_acc,test_acc_ar,test_acc_other
|
2 |
+
0,4.558201789855957,4.557050704956055,,0.0,0.0,,
|
3 |
+
250,1.7286615371704102,1.7170720100402832,,0.0707070707070707,0.16999999999999998,,
|
4 |
+
500,1.4398878812789917,1.4310863018035889,,5.434343434343434,5.96,,
|
5 |
+
750,1.2331775426864624,1.2203372716903687,,73.54545454545455,67.83,,
|
6 |
+
1000,1.1358768939971924,1.2272201776504517,,75.29292929292929,70.32000000000001,,
|
7 |
+
1250,0.5539495348930359,1.7250535488128662,,77.78787878787878,73.18,,
|
8 |
+
1500,0.1001376137137413,2.863405227661133,,78.0,74.55000000000001,,
|
9 |
+
1750,0.07596367597579956,3.4369161128997803,,78.83838383838383,77.46,,
|
10 |
+
2000,0.06951306015253067,3.724381685256958,,78.57575757575758,78.39,,
|
11 |
+
2250,0.06603887677192688,3.890638828277588,,79.06060606060606,78.68,,
|
12 |
+
2500,0.06290150433778763,3.9894931316375732,,79.94949494949495,82.75,,
|
13 |
+
2750,0.06049436703324318,4.099140167236328,,80.77777777777779,81.78,,
|
14 |
+
3000,0.05838387832045555,4.249746322631836,,80.53535353535354,83.32000000000001,,
|
15 |
+
3250,0.057155631482601166,4.353604316711426,,81.17171717171718,83.25,,
|
16 |
+
3500,0.055782441049814224,4.412276268005371,,81.0,83.96000000000001,,
|
17 |
+
3750,0.054552655667066574,4.499052047729492,,81.4040404040404,84.98,,
|
18 |
+
4000,0.053756218403577805,4.524080276489258,,80.96969696969697,85.02,,
|
19 |
+
4250,0.05292206257581711,4.601434707641602,,81.5050505050505,84.89,,
|
20 |
+
4500,0.052253320813179016,4.638243675231934,,81.73737373737374,85.17,,
|
21 |
+
4750,0.0518411286175251,4.682107448577881,,81.43434343434343,85.61,,
|
22 |
+
5000,0.051410287618637085,4.731619358062744,,81.51515151515152,86.03,,
|
teaching_arithmetic/out/addition_plain/ckpt_10000.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b6e21682cc49d528b55dd0b7569ffedb9f5a259f20fb30faddc983c6cfedf05
|
3 |
+
size 129138715
|
teaching_arithmetic/out/addition_plain/ckpt_10000_acc.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac751f03da9a54a32944b82532d2530352c89f6e23e7f53c5933c4697b29e685
|
3 |
+
size 129139359
|
teaching_arithmetic/out/addition_plain/ckpt_10000_final.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b02d470faa1df8f459c2b19670e716be7f44db1a4243b604585ce57648f1c94e
|
3 |
+
size 129139681
|