DatPySci committed on
Commit
d7c3124
·
verified ·
1 Parent(s): 4fc1433

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +8 -0
  2. models/OLMo-1B/config.yaml +413 -0
  3. models/OLMo-1B/data-indices/rank0.tsv.gz +3 -0
  4. models/OLMo-1B/data-indices/rank1.tsv.gz +3 -0
  5. models/OLMo-1B/data-indices/rank2.tsv.gz +3 -0
  6. models/OLMo-1B/data-indices/rank3.tsv.gz +3 -0
  7. models/OLMo-1B/step0/config.yaml +413 -0
  8. models/OLMo-1B/step0/rank0.pt +3 -0
  9. models/OLMo-1B/step0/rank1.pt +3 -0
  10. models/OLMo-1B/step0/rank2.pt +3 -0
  11. models/OLMo-1B/step0/rank3.pt +3 -0
  12. models/OLMo-1B/step12000-unsharded/config.yaml +413 -0
  13. models/OLMo-1B/step12000-unsharded/model.pt +3 -0
  14. models/OLMo-1B/step12000-unsharded/optim.pt +3 -0
  15. models/OLMo-1B/step12000-unsharded/train.pt +3 -0
  16. models/OLMo-1B/step12000/config.json +26 -0
  17. models/OLMo-1B/step12000/config.yaml +413 -0
  18. models/OLMo-1B/step12000/generation_config.json +6 -0
  19. models/OLMo-1B/step12000/pytorch_model.bin +3 -0
  20. models/OLMo-1B/step12000/rank0.pt +3 -0
  21. models/OLMo-1B/step12000/rank1.pt +3 -0
  22. models/OLMo-1B/step12000/rank2.pt +3 -0
  23. models/OLMo-1B/step12000/rank3.pt +3 -0
  24. models/OLMo-1B/step12000/special_tokens_map.json +3 -0
  25. models/OLMo-1B/step12000/tokenizer.json +0 -0
  26. models/OLMo-1B/step12000/tokenizer_config.json +39 -0
  27. models/OLMo-1B/step15000-unsharded/config.yaml +413 -0
  28. models/OLMo-1B/step15000-unsharded/model.pt +3 -0
  29. models/OLMo-1B/step15000-unsharded/optim.pt +3 -0
  30. models/OLMo-1B/step15000-unsharded/train.pt +3 -0
  31. models/OLMo-1B/step15000/config.json +26 -0
  32. models/OLMo-1B/step15000/config.yaml +413 -0
  33. models/OLMo-1B/step15000/generation_config.json +6 -0
  34. models/OLMo-1B/step15000/pytorch_model.bin +3 -0
  35. models/OLMo-1B/step15000/rank0.pt +3 -0
  36. models/OLMo-1B/step15000/rank1.pt +3 -0
  37. models/OLMo-1B/step15000/rank2.pt +3 -0
  38. models/OLMo-1B/step15000/rank3.pt +3 -0
  39. models/OLMo-1B/step15000/special_tokens_map.json +3 -0
  40. models/OLMo-1B/step15000/tokenizer.json +0 -0
  41. models/OLMo-1B/step15000/tokenizer_config.json +39 -0
  42. models/OLMo-1B/step18000-unsharded/config.yaml +413 -0
  43. models/OLMo-1B/step18000-unsharded/model.pt +3 -0
  44. models/OLMo-1B/step18000-unsharded/optim.pt +3 -0
  45. models/OLMo-1B/step18000-unsharded/train.pt +3 -0
  46. models/OLMo-1B/step18000/config.json +26 -0
  47. models/OLMo-1B/step18000/config.yaml +413 -0
  48. models/OLMo-1B/step18000/generation_config.json +6 -0
  49. models/OLMo-1B/step18000/pytorch_model.bin +3 -0
  50. models/OLMo-1B/step18000/rank0.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ models/OLMo-1B/wandb/wandb/run-20251214_154801-0cm37isp/run-0cm37isp.wandb filter=lfs diff=lfs merge=lfs -text
37
+ models/OLMo-1B/wandb/wandb/run-20251215_030807-9hgp01eh/run-9hgp01eh.wandb filter=lfs diff=lfs merge=lfs -text
38
+ models/OLMo-1B/wandb/wandb/run-20251216_022244-czbynfrm/run-czbynfrm.wandb filter=lfs diff=lfs merge=lfs -text
39
+ models/OLMo-1B/wandb/wandb/run-20251216_042206-huw71c6j/run-huw71c6j.wandb filter=lfs diff=lfs merge=lfs -text
40
+ models/OLMo-1B/wandb/wandb/run-20251217_030134-4zzl7ont/run-4zzl7ont.wandb filter=lfs diff=lfs merge=lfs -text
41
+ models/OLMo-1B/wandb/wandb/run-20251218_041159-wuflp8vz/run-wuflp8vz.wandb filter=lfs diff=lfs merge=lfs -text
42
+ models/OLMo-1B/wandb/wandb/run-20251219_032942-aw0e7ij9/run-aw0e7ij9.wandb filter=lfs diff=lfs merge=lfs -text
43
+ models/OLMo-1B/wandb/wandb/run-20251219_083508-zvk9qxz6/run-zvk9qxz6.wandb filter=lfs diff=lfs merge=lfs -text
models/OLMo-1B/config.yaml ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-1B-as_fm3_omi2
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 2048
7
+ n_heads: 16
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 16
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: false
22
+ attention_dropout: 0.0
23
+ multi_query_attention: false
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: true
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0005
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: cosine_with_warmup
65
+ units: steps
66
+ t_warmup: 2000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
88
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00019_00000_doc_shuffled.ds
89
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
90
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
91
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
92
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
93
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
94
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
95
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
96
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
97
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
98
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
99
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
100
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
101
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00032_00000_doc_shuffled.ds
102
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00033_00000_doc_shuffled.ds
103
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00034_00000_doc_shuffled.ds
104
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00035_00000_doc_shuffled.ds
105
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00036_00000_doc_shuffled.ds
106
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00037_00000_doc_shuffled.ds
107
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00038_00000_doc_shuffled.ds
108
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00039_00000_doc_shuffled.ds
109
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00040_00000_doc_shuffled.ds
110
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00041_00000_doc_shuffled.ds
111
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00042_00000_doc_shuffled.ds
112
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00043_00000_doc_shuffled.ds
113
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00044_00000_doc_shuffled.ds
114
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00045_00000_doc_shuffled.ds
115
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00046_00000_doc_shuffled.ds
116
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00047_00000_doc_shuffled.ds
117
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00048_00000_doc_shuffled.ds
118
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00049_00000_doc_shuffled.ds
119
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00050_00000_doc_shuffled.ds
120
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00051_00000_doc_shuffled.ds
121
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00052_00000_doc_shuffled.ds
122
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00053_00000_doc_shuffled.ds
123
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00054_00000_doc_shuffled.ds
124
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00055_00000_doc_shuffled.ds
125
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00056_00000_doc_shuffled.ds
126
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00057_00000_doc_shuffled.ds
127
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00058_00000_doc_shuffled.ds
128
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00059_00000_doc_shuffled.ds
129
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00060_00000_doc_shuffled.ds
130
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00061_00000_doc_shuffled.ds
131
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00062_00000_doc_shuffled.ds
132
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00063_00000_doc_shuffled.ds
133
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00064_00000_doc_shuffled.ds
134
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00065_00000_doc_shuffled.ds
135
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00066_00000_doc_shuffled.ds
136
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00067_00000_doc_shuffled.ds
137
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00068_00000_doc_shuffled.ds
138
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00069_00000_doc_shuffled.ds
139
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00070_00000_doc_shuffled.ds
140
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00071_00000_doc_shuffled.ds
141
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00072_00000_doc_shuffled.ds
142
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00073_00000_doc_shuffled.ds
143
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00074_00000_doc_shuffled.ds
144
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00075_00000_doc_shuffled.ds
145
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00076_00000_doc_shuffled.ds
146
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00077_00000_doc_shuffled.ds
147
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00078_00000_doc_shuffled.ds
148
+ - data_token/as_fm3_omi2/finemath3-tokenized/00000_00000_doc_shuffled.ds
149
+ - data_token/as_fm3_omi2/finemath3-tokenized/00001_00000_doc_shuffled.ds
150
+ - data_token/as_fm3_omi2/finemath3-tokenized/00002_00000_doc_shuffled.ds
151
+ - data_token/as_fm3_omi2/finemath3-tokenized/00003_00000_doc_shuffled.ds
152
+ - data_token/as_fm3_omi2/finemath3-tokenized/00004_00000_doc_shuffled.ds
153
+ - data_token/as_fm3_omi2/finemath3-tokenized/00005_00000_doc_shuffled.ds
154
+ - data_token/as_fm3_omi2/finemath3-tokenized/00006_00000_doc_shuffled.ds
155
+ - data_token/as_fm3_omi2/finemath3-tokenized/00007_00000_doc_shuffled.ds
156
+ - data_token/as_fm3_omi2/finemath3-tokenized/00008_00000_doc_shuffled.ds
157
+ - data_token/as_fm3_omi2/finemath3-tokenized/00009_00000_doc_shuffled.ds
158
+ - data_token/as_fm3_omi2/finemath3-tokenized/00010_00000_doc_shuffled.ds
159
+ - data_token/as_fm3_omi2/finemath3-tokenized/00011_00000_doc_shuffled.ds
160
+ - data_token/as_fm3_omi2/finemath3-tokenized/00012_00000_doc_shuffled.ds
161
+ - data_token/as_fm3_omi2/finemath3-tokenized/00013_00000_doc_shuffled.ds
162
+ - data_token/as_fm3_omi2/finemath3-tokenized/00014_00000_doc_shuffled.ds
163
+ - data_token/as_fm3_omi2/finemath3-tokenized/00015_00000_doc_shuffled.ds
164
+ - data_token/as_fm3_omi2/finemath3-tokenized/00016_00000_doc_shuffled.ds
165
+ - data_token/as_fm3_omi2/finemath3-tokenized/00017_00000_doc_shuffled.ds
166
+ - data_token/as_fm3_omi2/finemath3-tokenized/00018_00000_doc_shuffled.ds
167
+ - data_token/as_fm3_omi2/finemath3-tokenized/00019_00000_doc_shuffled.ds
168
+ - data_token/as_fm3_omi2/finemath3-tokenized/00020_00000_doc_shuffled.ds
169
+ - data_token/as_fm3_omi2/finemath3-tokenized/00021_00000_doc_shuffled.ds
170
+ - data_token/as_fm3_omi2/finemath3-tokenized/00022_00000_doc_shuffled.ds
171
+ - data_token/as_fm3_omi2/finemath3-tokenized/00023_00000_doc_shuffled.ds
172
+ - data_token/as_fm3_omi2/finemath3-tokenized/00024_00000_doc_shuffled.ds
173
+ - data_token/as_fm3_omi2/finemath3-tokenized/00025_00000_doc_shuffled.ds
174
+ - data_token/as_fm3_omi2/finemath3-tokenized/00026_00000_doc_shuffled.ds
175
+ - data_token/as_fm3_omi2/finemath3-tokenized/00027_00000_doc_shuffled.ds
176
+ - data_token/as_fm3_omi2/finemath3-tokenized/00028_00000_doc_shuffled.ds
177
+ - data_token/as_fm3_omi2/finemath3-tokenized/00029_00000_doc_shuffled.ds
178
+ - data_token/as_fm3_omi2/finemath3-tokenized/00030_00000_doc_shuffled.ds
179
+ - data_token/as_fm3_omi2/finemath3-tokenized/00031_00000_doc_shuffled.ds
180
+ - data_token/as_fm3_omi2/finemath3-tokenized/00032_00000_doc_shuffled.ds
181
+ - data_token/as_fm3_omi2/finemath3-tokenized/00033_00000_doc_shuffled.ds
182
+ - data_token/as_fm3_omi2/finemath3-tokenized/00034_00000_doc_shuffled.ds
183
+ - data_token/as_fm3_omi2/finemath3-tokenized/00035_00000_doc_shuffled.ds
184
+ - data_token/as_fm3_omi2/finemath3-tokenized/00036_00000_doc_shuffled.ds
185
+ - data_token/as_fm3_omi2/finemath3-tokenized/00037_00000_doc_shuffled.ds
186
+ - data_token/as_fm3_omi2/finemath3-tokenized/00038_00000_doc_shuffled.ds
187
+ - data_token/as_fm3_omi2/finemath3-tokenized/00039_00000_doc_shuffled.ds
188
+ - data_token/as_fm3_omi2/finemath3-tokenized/00040_00000_doc_shuffled.ds
189
+ - data_token/as_fm3_omi2/finemath3-tokenized/00041_00000_doc_shuffled.ds
190
+ - data_token/as_fm3_omi2/finemath3-tokenized/00042_00000_doc_shuffled.ds
191
+ - data_token/as_fm3_omi2/finemath3-tokenized/00043_00000_doc_shuffled.ds
192
+ - data_token/as_fm3_omi2/finemath3-tokenized/00044_00000_doc_shuffled.ds
193
+ - data_token/as_fm3_omi2/finemath3-tokenized/00045_00000_doc_shuffled.ds
194
+ - data_token/as_fm3_omi2/finemath3-tokenized/00046_00000_doc_shuffled.ds
195
+ - data_token/as_fm3_omi2/finemath3-tokenized/00047_00000_doc_shuffled.ds
196
+ - data_token/as_fm3_omi2/finemath3-tokenized/00048_00000_doc_shuffled.ds
197
+ - data_token/as_fm3_omi2/finemath3-tokenized/00049_00000_doc_shuffled.ds
198
+ - data_token/as_fm3_omi2/finemath3-tokenized/00050_00000_doc_shuffled.ds
199
+ - data_token/as_fm3_omi2/finemath3-tokenized/00051_00000_doc_shuffled.ds
200
+ - data_token/as_fm3_omi2/finemath3-tokenized/00052_00000_doc_shuffled.ds
201
+ - data_token/as_fm3_omi2/finemath3-tokenized/00053_00000_doc_shuffled.ds
202
+ - data_token/as_fm3_omi2/finemath3-tokenized/00054_00000_doc_shuffled.ds
203
+ - data_token/as_fm3_omi2/finemath3-tokenized/00055_00000_doc_shuffled.ds
204
+ - data_token/as_fm3_omi2/finemath3-tokenized/00056_00000_doc_shuffled.ds
205
+ - data_token/as_fm3_omi2/finemath3-tokenized/00057_00000_doc_shuffled.ds
206
+ - data_token/as_fm3_omi2/finemath3-tokenized/00058_00000_doc_shuffled.ds
207
+ - data_token/as_fm3_omi2/finemath3-tokenized/00059_00000_doc_shuffled.ds
208
+ - data_token/as_fm3_omi2/finemath3-tokenized/00060_00000_doc_shuffled.ds
209
+ - data_token/as_fm3_omi2/finemath3-tokenized/00061_00000_doc_shuffled.ds
210
+ - data_token/as_fm3_omi2/finemath3-tokenized/00062_00000_doc_shuffled.ds
211
+ - data_token/as_fm3_omi2/finemath3-tokenized/00063_00000_doc_shuffled.ds
212
+ - data_token/as_fm3_omi2/finemath3-tokenized/00064_00000_doc_shuffled.ds
213
+ - data_token/as_fm3_omi2/finemath3-tokenized/00065_00000_doc_shuffled.ds
214
+ - data_token/as_fm3_omi2/finemath3-tokenized/00066_00000_doc_shuffled.ds
215
+ - data_token/as_fm3_omi2/finemath3-tokenized/00067_00000_doc_shuffled.ds
216
+ - data_token/as_fm3_omi2/finemath3-tokenized/00068_00000_doc_shuffled.ds
217
+ - data_token/as_fm3_omi2/finemath3-tokenized/00069_00000_doc_shuffled.ds
218
+ - data_token/as_fm3_omi2/finemath3-tokenized/00070_00000_doc_shuffled.ds
219
+ - data_token/as_fm3_omi2/finemath3-tokenized/00071_00000_doc_shuffled.ds
220
+ - data_token/as_fm3_omi2/finemath3-tokenized/00072_00000_doc_shuffled.ds
221
+ - data_token/as_fm3_omi2/finemath3-tokenized/00073_00000_doc_shuffled.ds
222
+ - data_token/as_fm3_omi2/finemath3-tokenized/00074_00000_doc_shuffled.ds
223
+ - data_token/as_fm3_omi2/finemath3-tokenized/00075_00000_doc_shuffled.ds
224
+ - data_token/as_fm3_omi2/finemath3-tokenized/00076_00000_doc_shuffled.ds
225
+ - data_token/as_fm3_omi2/finemath3-tokenized/00077_00000_doc_shuffled.ds
226
+ - data_token/as_fm3_omi2/finemath3-tokenized/00078_00000_doc_shuffled.ds
227
+ - data_token/as_fm3_omi2/finemath3-tokenized/00079_00000_doc_shuffled.ds
228
+ - data_token/as_fm3_omi2/finemath3-tokenized/00080_00000_doc_shuffled.ds
229
+ - data_token/as_fm3_omi2/finemath3-tokenized/00081_00000_doc_shuffled.ds
230
+ - data_token/as_fm3_omi2/finemath3-tokenized/00082_00000_doc_shuffled.ds
231
+ - data_token/as_fm3_omi2/finemath3-tokenized/00083_00000_doc_shuffled.ds
232
+ - data_token/as_fm3_omi2/finemath3-tokenized/00084_00000_doc_shuffled.ds
233
+ - data_token/as_fm3_omi2/finemath3-tokenized/00085_00000_doc_shuffled.ds
234
+ - data_token/as_fm3_omi2/finemath3-tokenized/00086_00000_doc_shuffled.ds
235
+ - data_token/as_fm3_omi2/finemath3-tokenized/00087_00000_doc_shuffled.ds
236
+ - data_token/as_fm3_omi2/finemath3-tokenized/00088_00000_doc_shuffled.ds
237
+ - data_token/as_fm3_omi2/finemath3-tokenized/00089_00000_doc_shuffled.ds
238
+ - data_token/as_fm3_omi2/finemath3-tokenized/00090_00000_doc_shuffled.ds
239
+ - data_token/as_fm3_omi2/finemath3-tokenized/00091_00000_doc_shuffled.ds
240
+ - data_token/as_fm3_omi2/finemath3-tokenized/00092_00000_doc_shuffled.ds
241
+ - data_token/as_fm3_omi2/finemath3-tokenized/00093_00000_doc_shuffled.ds
242
+ - data_token/as_fm3_omi2/finemath3-tokenized/00094_00000_doc_shuffled.ds
243
+ - data_token/as_fm3_omi2/finemath3-tokenized/00095_00000_doc_shuffled.ds
244
+ - data_token/as_fm3_omi2/finemath3-tokenized/00096_00000_doc_shuffled.ds
245
+ - data_token/as_fm3_omi2/finemath3-tokenized/00097_00000_doc_shuffled.ds
246
+ - data_token/as_fm3_omi2/finemath3-tokenized/00098_00000_doc_shuffled.ds
247
+ - data_token/as_fm3_omi2/finemath3-tokenized/00099_00000_doc_shuffled.ds
248
+ - data_token/as_fm3_omi2/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
249
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
250
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
251
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
252
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
253
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
254
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
255
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
256
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
257
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
258
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
259
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
260
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
261
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
262
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
263
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
264
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
265
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
266
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
267
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
268
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
269
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
270
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
271
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
272
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
273
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
274
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
275
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
276
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
277
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
278
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
279
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
280
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
281
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00032_00000_doc_shuffled.ds
282
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00033_00000_doc_shuffled.ds
283
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00034_00000_doc_shuffled.ds
284
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00035_00000_doc_shuffled.ds
285
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00036_00000_doc_shuffled.ds
286
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00037_00000_doc_shuffled.ds
287
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00038_00000_doc_shuffled.ds
288
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00039_00000_doc_shuffled.ds
289
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00040_00000_doc_shuffled.ds
290
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00041_00000_doc_shuffled.ds
291
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00042_00000_doc_shuffled.ds
292
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00043_00000_doc_shuffled.ds
293
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00044_00000_doc_shuffled.ds
294
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00045_00000_doc_shuffled.ds
295
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00046_00000_doc_shuffled.ds
296
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00047_00000_doc_shuffled.ds
297
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00048_00000_doc_shuffled.ds
298
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00049_00000_doc_shuffled.ds
299
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00050_00000_doc_shuffled.ds
300
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00051_00000_doc_shuffled.ds
301
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00052_00000_doc_shuffled.ds
302
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00053_00000_doc_shuffled.ds
303
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00054_00000_doc_shuffled.ds
304
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00000_00000_doc_shuffled.ds
305
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00001_00000_doc_shuffled.ds
306
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00002_00000_doc_shuffled.ds
307
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00003_00000_doc_shuffled.ds
308
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00004_00000_doc_shuffled.ds
309
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00005_00000_doc_shuffled.ds
310
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00006_00000_doc_shuffled.ds
311
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00007_00000_doc_shuffled.ds
312
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00008_00000_doc_shuffled.ds
313
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00009_00000_doc_shuffled.ds
314
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00010_00000_doc_shuffled.ds
315
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00011_00000_doc_shuffled.ds
316
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00012_00000_doc_shuffled.ds
317
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00013_00000_doc_shuffled.ds
318
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00014_00000_doc_shuffled.ds
319
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00015_00000_doc_shuffled.ds
320
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00016_00000_doc_shuffled.ds
321
+ memmap_dtype: uint16
322
+ datasets: null
323
+ label_mask_paths: null
324
+ pad_direction: right
325
+ generate_attention_mask: false
326
+ generate_doc_lengths: false
327
+ num_workers: 32
328
+ drop_last: true
329
+ pin_memory: true
330
+ prefetch_factor: 8
331
+ persistent_workers: true
332
+ timeout: 0
333
+ seed: null
334
+ instance_filter: null
335
+ custom_dataset: null
336
+ restore_dataloader: true
337
+ fast_forward_batches: null
338
+ evaluators: []
339
+ eval_interval: 5000
340
+ tokenizer:
341
+ identifier: meta-llama/Llama-2-7b-hf
342
+ truncate_direction: right
343
+ save_folder: checkpoints/OLMo-1B-as_fm3_omi2
344
+ remote_save_folder: null
345
+ canceled_check_interval: 6000
346
+ save_interval: 3000
347
+ save_interval_unsharded: 3000
348
+ save_interval_ephemeral: null
349
+ save_num_checkpoints_to_keep: -1
350
+ save_num_unsharded_checkpoints_to_keep: -1
351
+ save_overwrite: true
352
+ force_save_unsharded: false
353
+ no_pre_train_checkpoint: false
354
+ load_path: step9000-unsharded
355
+ load_path_sharded_checkpointer: null
356
+ try_load_latest_save: false
357
+ reset_optimizer_state: false
358
+ reset_trainer_state: false
359
+ sharded_checkpointer: torch_legacy
360
+ new_style_checkpoints: null
361
+ max_duration: 1ep
362
+ global_train_batch_size: 512
363
+ device_train_batch_size: 128
364
+ device_train_microbatch_size: 16
365
+ device_eval_batch_size: 16
366
+ eval_subset_num_batches: -1
367
+ eval_on_load: false
368
+ device_train_grad_accum: 8
369
+ max_grad_norm: 1.0
370
+ max_grad_norm_ratio: null
371
+ precision: amp_bf16
372
+ wandb:
373
+ project: olmo-debug
374
+ entity: null
375
+ group: null
376
+ name: OLMo-1B-as_fm3_omi2
377
+ tags:
378
+ - watching
379
+ log_artifacts: false
380
+ rank_zero_only: true
381
+ log_interval: 1
382
+ speed_monitor:
383
+ window_size: 20
384
+ gpu_flops_available: null
385
+ console_log_interval: 1
386
+ gen1_gc_interval: 1
387
+ compile: null
388
+ distributed_strategy: fsdp
389
+ fsdp:
390
+ use_orig_params: true
391
+ sharding_strategy: FULL_SHARD
392
+ wrapping_strategy: null
393
+ precision: mixed
394
+ hybrid_sharding_num_model_replicas: null
395
+ ddp:
396
+ grad_sync_mode: batch
397
+ find_unused_params: false
398
+ single:
399
+ device: auto
400
+ softmax_auxiliary_loss: false
401
+ auxiliary_loss_multiplier: 0.0001
402
+ time_limit: null
403
+ extra_steps_after_cancel: 10
404
+ early_stopping_factor: null
405
+ save_data_indices: true
406
+ python_profiling: false
407
+ torch_profiling: false
408
+ stop_at: null
409
+ stop_after: null
410
+ activation_checkpointing: null
411
+ fused_loss: null
412
+ hf_datasets_cache_dir: null
413
+ module_outputs_save_steps: null
models/OLMo-1B/data-indices/rank0.tsv.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa4500dfe712cc9eb0c7124191ffb1e8e55a67983c179782c4ee79097c7b0adb
3
+ size 5892706
models/OLMo-1B/data-indices/rank1.tsv.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0945afe472f69ffcfefd79cf51fc5cd1b9ed4bfa6cc7d4c7ebc31f90f2a5603b
3
+ size 5893668
models/OLMo-1B/data-indices/rank2.tsv.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2f42505ef01d33a09089c9c9821d8fff6e74809e6d2c3d576e7e4086b4e0150
3
+ size 5893712
models/OLMo-1B/data-indices/rank3.tsv.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a7406f65dab671d3aa4b961ed253f70018b0cc00dfcee461dc54bf7d26fe8fc
3
+ size 5892979
models/OLMo-1B/step0/config.yaml ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-1B-as_fm3_omi2
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 2048
7
+ n_heads: 16
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 16
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: false
22
+ attention_dropout: 0.0
23
+ multi_query_attention: false
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: true
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0005
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: cosine_with_warmup
65
+ units: steps
66
+ t_warmup: 2000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
88
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00019_00000_doc_shuffled.ds
89
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
90
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
91
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
92
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
93
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
94
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
95
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
96
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
97
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
98
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
99
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
100
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
101
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00032_00000_doc_shuffled.ds
102
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00033_00000_doc_shuffled.ds
103
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00034_00000_doc_shuffled.ds
104
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00035_00000_doc_shuffled.ds
105
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00036_00000_doc_shuffled.ds
106
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00037_00000_doc_shuffled.ds
107
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00038_00000_doc_shuffled.ds
108
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00039_00000_doc_shuffled.ds
109
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00040_00000_doc_shuffled.ds
110
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00041_00000_doc_shuffled.ds
111
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00042_00000_doc_shuffled.ds
112
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00043_00000_doc_shuffled.ds
113
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00044_00000_doc_shuffled.ds
114
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00045_00000_doc_shuffled.ds
115
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00046_00000_doc_shuffled.ds
116
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00047_00000_doc_shuffled.ds
117
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00048_00000_doc_shuffled.ds
118
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00049_00000_doc_shuffled.ds
119
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00050_00000_doc_shuffled.ds
120
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00051_00000_doc_shuffled.ds
121
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00052_00000_doc_shuffled.ds
122
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00053_00000_doc_shuffled.ds
123
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00054_00000_doc_shuffled.ds
124
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00055_00000_doc_shuffled.ds
125
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00056_00000_doc_shuffled.ds
126
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00057_00000_doc_shuffled.ds
127
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00058_00000_doc_shuffled.ds
128
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00059_00000_doc_shuffled.ds
129
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00060_00000_doc_shuffled.ds
130
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00061_00000_doc_shuffled.ds
131
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00062_00000_doc_shuffled.ds
132
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00063_00000_doc_shuffled.ds
133
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00064_00000_doc_shuffled.ds
134
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00065_00000_doc_shuffled.ds
135
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00066_00000_doc_shuffled.ds
136
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00067_00000_doc_shuffled.ds
137
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00068_00000_doc_shuffled.ds
138
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00069_00000_doc_shuffled.ds
139
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00070_00000_doc_shuffled.ds
140
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00071_00000_doc_shuffled.ds
141
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00072_00000_doc_shuffled.ds
142
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00073_00000_doc_shuffled.ds
143
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00074_00000_doc_shuffled.ds
144
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00075_00000_doc_shuffled.ds
145
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00076_00000_doc_shuffled.ds
146
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00077_00000_doc_shuffled.ds
147
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00078_00000_doc_shuffled.ds
148
+ - data_token/as_fm3_omi2/finemath3-tokenized/00000_00000_doc_shuffled.ds
149
+ - data_token/as_fm3_omi2/finemath3-tokenized/00001_00000_doc_shuffled.ds
150
+ - data_token/as_fm3_omi2/finemath3-tokenized/00002_00000_doc_shuffled.ds
151
+ - data_token/as_fm3_omi2/finemath3-tokenized/00003_00000_doc_shuffled.ds
152
+ - data_token/as_fm3_omi2/finemath3-tokenized/00004_00000_doc_shuffled.ds
153
+ - data_token/as_fm3_omi2/finemath3-tokenized/00005_00000_doc_shuffled.ds
154
+ - data_token/as_fm3_omi2/finemath3-tokenized/00006_00000_doc_shuffled.ds
155
+ - data_token/as_fm3_omi2/finemath3-tokenized/00007_00000_doc_shuffled.ds
156
+ - data_token/as_fm3_omi2/finemath3-tokenized/00008_00000_doc_shuffled.ds
157
+ - data_token/as_fm3_omi2/finemath3-tokenized/00009_00000_doc_shuffled.ds
158
+ - data_token/as_fm3_omi2/finemath3-tokenized/00010_00000_doc_shuffled.ds
159
+ - data_token/as_fm3_omi2/finemath3-tokenized/00011_00000_doc_shuffled.ds
160
+ - data_token/as_fm3_omi2/finemath3-tokenized/00012_00000_doc_shuffled.ds
161
+ - data_token/as_fm3_omi2/finemath3-tokenized/00013_00000_doc_shuffled.ds
162
+ - data_token/as_fm3_omi2/finemath3-tokenized/00014_00000_doc_shuffled.ds
163
+ - data_token/as_fm3_omi2/finemath3-tokenized/00015_00000_doc_shuffled.ds
164
+ - data_token/as_fm3_omi2/finemath3-tokenized/00016_00000_doc_shuffled.ds
165
+ - data_token/as_fm3_omi2/finemath3-tokenized/00017_00000_doc_shuffled.ds
166
+ - data_token/as_fm3_omi2/finemath3-tokenized/00018_00000_doc_shuffled.ds
167
+ - data_token/as_fm3_omi2/finemath3-tokenized/00019_00000_doc_shuffled.ds
168
+ - data_token/as_fm3_omi2/finemath3-tokenized/00020_00000_doc_shuffled.ds
169
+ - data_token/as_fm3_omi2/finemath3-tokenized/00021_00000_doc_shuffled.ds
170
+ - data_token/as_fm3_omi2/finemath3-tokenized/00022_00000_doc_shuffled.ds
171
+ - data_token/as_fm3_omi2/finemath3-tokenized/00023_00000_doc_shuffled.ds
172
+ - data_token/as_fm3_omi2/finemath3-tokenized/00024_00000_doc_shuffled.ds
173
+ - data_token/as_fm3_omi2/finemath3-tokenized/00025_00000_doc_shuffled.ds
174
+ - data_token/as_fm3_omi2/finemath3-tokenized/00026_00000_doc_shuffled.ds
175
+ - data_token/as_fm3_omi2/finemath3-tokenized/00027_00000_doc_shuffled.ds
176
+ - data_token/as_fm3_omi2/finemath3-tokenized/00028_00000_doc_shuffled.ds
177
+ - data_token/as_fm3_omi2/finemath3-tokenized/00029_00000_doc_shuffled.ds
178
+ - data_token/as_fm3_omi2/finemath3-tokenized/00030_00000_doc_shuffled.ds
179
+ - data_token/as_fm3_omi2/finemath3-tokenized/00031_00000_doc_shuffled.ds
180
+ - data_token/as_fm3_omi2/finemath3-tokenized/00032_00000_doc_shuffled.ds
181
+ - data_token/as_fm3_omi2/finemath3-tokenized/00033_00000_doc_shuffled.ds
182
+ - data_token/as_fm3_omi2/finemath3-tokenized/00034_00000_doc_shuffled.ds
183
+ - data_token/as_fm3_omi2/finemath3-tokenized/00035_00000_doc_shuffled.ds
184
+ - data_token/as_fm3_omi2/finemath3-tokenized/00036_00000_doc_shuffled.ds
185
+ - data_token/as_fm3_omi2/finemath3-tokenized/00037_00000_doc_shuffled.ds
186
+ - data_token/as_fm3_omi2/finemath3-tokenized/00038_00000_doc_shuffled.ds
187
+ - data_token/as_fm3_omi2/finemath3-tokenized/00039_00000_doc_shuffled.ds
188
+ - data_token/as_fm3_omi2/finemath3-tokenized/00040_00000_doc_shuffled.ds
189
+ - data_token/as_fm3_omi2/finemath3-tokenized/00041_00000_doc_shuffled.ds
190
+ - data_token/as_fm3_omi2/finemath3-tokenized/00042_00000_doc_shuffled.ds
191
+ - data_token/as_fm3_omi2/finemath3-tokenized/00043_00000_doc_shuffled.ds
192
+ - data_token/as_fm3_omi2/finemath3-tokenized/00044_00000_doc_shuffled.ds
193
+ - data_token/as_fm3_omi2/finemath3-tokenized/00045_00000_doc_shuffled.ds
194
+ - data_token/as_fm3_omi2/finemath3-tokenized/00046_00000_doc_shuffled.ds
195
+ - data_token/as_fm3_omi2/finemath3-tokenized/00047_00000_doc_shuffled.ds
196
+ - data_token/as_fm3_omi2/finemath3-tokenized/00048_00000_doc_shuffled.ds
197
+ - data_token/as_fm3_omi2/finemath3-tokenized/00049_00000_doc_shuffled.ds
198
+ - data_token/as_fm3_omi2/finemath3-tokenized/00050_00000_doc_shuffled.ds
199
+ - data_token/as_fm3_omi2/finemath3-tokenized/00051_00000_doc_shuffled.ds
200
+ - data_token/as_fm3_omi2/finemath3-tokenized/00052_00000_doc_shuffled.ds
201
+ - data_token/as_fm3_omi2/finemath3-tokenized/00053_00000_doc_shuffled.ds
202
+ - data_token/as_fm3_omi2/finemath3-tokenized/00054_00000_doc_shuffled.ds
203
+ - data_token/as_fm3_omi2/finemath3-tokenized/00055_00000_doc_shuffled.ds
204
+ - data_token/as_fm3_omi2/finemath3-tokenized/00056_00000_doc_shuffled.ds
205
+ - data_token/as_fm3_omi2/finemath3-tokenized/00057_00000_doc_shuffled.ds
206
+ - data_token/as_fm3_omi2/finemath3-tokenized/00058_00000_doc_shuffled.ds
207
+ - data_token/as_fm3_omi2/finemath3-tokenized/00059_00000_doc_shuffled.ds
208
+ - data_token/as_fm3_omi2/finemath3-tokenized/00060_00000_doc_shuffled.ds
209
+ - data_token/as_fm3_omi2/finemath3-tokenized/00061_00000_doc_shuffled.ds
210
+ - data_token/as_fm3_omi2/finemath3-tokenized/00062_00000_doc_shuffled.ds
211
+ - data_token/as_fm3_omi2/finemath3-tokenized/00063_00000_doc_shuffled.ds
212
+ - data_token/as_fm3_omi2/finemath3-tokenized/00064_00000_doc_shuffled.ds
213
+ - data_token/as_fm3_omi2/finemath3-tokenized/00065_00000_doc_shuffled.ds
214
+ - data_token/as_fm3_omi2/finemath3-tokenized/00066_00000_doc_shuffled.ds
215
+ - data_token/as_fm3_omi2/finemath3-tokenized/00067_00000_doc_shuffled.ds
216
+ - data_token/as_fm3_omi2/finemath3-tokenized/00068_00000_doc_shuffled.ds
217
+ - data_token/as_fm3_omi2/finemath3-tokenized/00069_00000_doc_shuffled.ds
218
+ - data_token/as_fm3_omi2/finemath3-tokenized/00070_00000_doc_shuffled.ds
219
+ - data_token/as_fm3_omi2/finemath3-tokenized/00071_00000_doc_shuffled.ds
220
+ - data_token/as_fm3_omi2/finemath3-tokenized/00072_00000_doc_shuffled.ds
221
+ - data_token/as_fm3_omi2/finemath3-tokenized/00073_00000_doc_shuffled.ds
222
+ - data_token/as_fm3_omi2/finemath3-tokenized/00074_00000_doc_shuffled.ds
223
+ - data_token/as_fm3_omi2/finemath3-tokenized/00075_00000_doc_shuffled.ds
224
+ - data_token/as_fm3_omi2/finemath3-tokenized/00076_00000_doc_shuffled.ds
225
+ - data_token/as_fm3_omi2/finemath3-tokenized/00077_00000_doc_shuffled.ds
226
+ - data_token/as_fm3_omi2/finemath3-tokenized/00078_00000_doc_shuffled.ds
227
+ - data_token/as_fm3_omi2/finemath3-tokenized/00079_00000_doc_shuffled.ds
228
+ - data_token/as_fm3_omi2/finemath3-tokenized/00080_00000_doc_shuffled.ds
229
+ - data_token/as_fm3_omi2/finemath3-tokenized/00081_00000_doc_shuffled.ds
230
+ - data_token/as_fm3_omi2/finemath3-tokenized/00082_00000_doc_shuffled.ds
231
+ - data_token/as_fm3_omi2/finemath3-tokenized/00083_00000_doc_shuffled.ds
232
+ - data_token/as_fm3_omi2/finemath3-tokenized/00084_00000_doc_shuffled.ds
233
+ - data_token/as_fm3_omi2/finemath3-tokenized/00085_00000_doc_shuffled.ds
234
+ - data_token/as_fm3_omi2/finemath3-tokenized/00086_00000_doc_shuffled.ds
235
+ - data_token/as_fm3_omi2/finemath3-tokenized/00087_00000_doc_shuffled.ds
236
+ - data_token/as_fm3_omi2/finemath3-tokenized/00088_00000_doc_shuffled.ds
237
+ - data_token/as_fm3_omi2/finemath3-tokenized/00089_00000_doc_shuffled.ds
238
+ - data_token/as_fm3_omi2/finemath3-tokenized/00090_00000_doc_shuffled.ds
239
+ - data_token/as_fm3_omi2/finemath3-tokenized/00091_00000_doc_shuffled.ds
240
+ - data_token/as_fm3_omi2/finemath3-tokenized/00092_00000_doc_shuffled.ds
241
+ - data_token/as_fm3_omi2/finemath3-tokenized/00093_00000_doc_shuffled.ds
242
+ - data_token/as_fm3_omi2/finemath3-tokenized/00094_00000_doc_shuffled.ds
243
+ - data_token/as_fm3_omi2/finemath3-tokenized/00095_00000_doc_shuffled.ds
244
+ - data_token/as_fm3_omi2/finemath3-tokenized/00096_00000_doc_shuffled.ds
245
+ - data_token/as_fm3_omi2/finemath3-tokenized/00097_00000_doc_shuffled.ds
246
+ - data_token/as_fm3_omi2/finemath3-tokenized/00098_00000_doc_shuffled.ds
247
+ - data_token/as_fm3_omi2/finemath3-tokenized/00099_00000_doc_shuffled.ds
248
+ - data_token/as_fm3_omi2/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
249
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
250
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
251
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
252
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
253
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
254
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
255
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
256
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
257
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
258
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
259
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
260
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
261
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
262
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
263
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
264
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
265
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
266
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
267
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
268
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
269
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
270
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
271
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
272
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
273
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
274
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
275
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
276
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
277
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
278
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
279
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
280
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
281
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00032_00000_doc_shuffled.ds
282
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00033_00000_doc_shuffled.ds
283
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00034_00000_doc_shuffled.ds
284
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00035_00000_doc_shuffled.ds
285
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00036_00000_doc_shuffled.ds
286
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00037_00000_doc_shuffled.ds
287
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00038_00000_doc_shuffled.ds
288
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00039_00000_doc_shuffled.ds
289
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00040_00000_doc_shuffled.ds
290
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00041_00000_doc_shuffled.ds
291
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00042_00000_doc_shuffled.ds
292
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00043_00000_doc_shuffled.ds
293
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00044_00000_doc_shuffled.ds
294
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00045_00000_doc_shuffled.ds
295
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00046_00000_doc_shuffled.ds
296
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00047_00000_doc_shuffled.ds
297
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00048_00000_doc_shuffled.ds
298
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00049_00000_doc_shuffled.ds
299
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00050_00000_doc_shuffled.ds
300
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00051_00000_doc_shuffled.ds
301
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00052_00000_doc_shuffled.ds
302
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00053_00000_doc_shuffled.ds
303
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00054_00000_doc_shuffled.ds
304
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00000_00000_doc_shuffled.ds
305
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00001_00000_doc_shuffled.ds
306
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00002_00000_doc_shuffled.ds
307
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00003_00000_doc_shuffled.ds
308
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00004_00000_doc_shuffled.ds
309
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00005_00000_doc_shuffled.ds
310
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00006_00000_doc_shuffled.ds
311
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00007_00000_doc_shuffled.ds
312
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00008_00000_doc_shuffled.ds
313
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00009_00000_doc_shuffled.ds
314
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00010_00000_doc_shuffled.ds
315
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00011_00000_doc_shuffled.ds
316
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00012_00000_doc_shuffled.ds
317
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00013_00000_doc_shuffled.ds
318
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00014_00000_doc_shuffled.ds
319
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00015_00000_doc_shuffled.ds
320
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00016_00000_doc_shuffled.ds
321
+ memmap_dtype: uint16
322
+ datasets: null
323
+ label_mask_paths: null
324
+ pad_direction: right
325
+ generate_attention_mask: false
326
+ generate_doc_lengths: false
327
+ num_workers: 32
328
+ drop_last: true
329
+ pin_memory: true
330
+ prefetch_factor: 8
331
+ persistent_workers: true
332
+ timeout: 0
333
+ seed: null
334
+ instance_filter: null
335
+ custom_dataset: null
336
+ restore_dataloader: true
337
+ fast_forward_batches: null
338
+ evaluators: []
339
+ eval_interval: 5000
340
+ tokenizer:
341
+ identifier: meta-llama/Llama-2-7b-hf
342
+ truncate_direction: right
343
+ save_folder: checkpoints/OLMo-1B-as_fm3_omi2
344
+ remote_save_folder: null
345
+ canceled_check_interval: 6000
346
+ save_interval: 3000
347
+ save_interval_unsharded: 3000
348
+ save_interval_ephemeral: null
349
+ save_num_checkpoints_to_keep: -1
350
+ save_num_unsharded_checkpoints_to_keep: -1
351
+ save_overwrite: true
352
+ force_save_unsharded: false
353
+ no_pre_train_checkpoint: false
354
+ load_path: null
355
+ load_path_sharded_checkpointer: null
356
+ try_load_latest_save: false
357
+ reset_optimizer_state: false
358
+ reset_trainer_state: false
359
+ sharded_checkpointer: torch_legacy
360
+ new_style_checkpoints: null
361
+ max_duration: 1ep
362
+ global_train_batch_size: 512
363
+ device_train_batch_size: 128
364
+ device_train_microbatch_size: 16
365
+ device_eval_batch_size: 16
366
+ eval_subset_num_batches: -1
367
+ eval_on_load: false
368
+ device_train_grad_accum: 8
369
+ max_grad_norm: 1.0
370
+ max_grad_norm_ratio: null
371
+ precision: amp_bf16
372
+ wandb:
373
+ project: olmo-debug
374
+ entity: null
375
+ group: null
376
+ name: OLMo-1B-as_fm3_omi2
377
+ tags:
378
+ - watching
379
+ log_artifacts: false
380
+ rank_zero_only: true
381
+ log_interval: 1
382
+ speed_monitor:
383
+ window_size: 20
384
+ gpu_flops_available: null
385
+ console_log_interval: 1
386
+ gen1_gc_interval: 1
387
+ compile: null
388
+ distributed_strategy: fsdp
389
+ fsdp:
390
+ use_orig_params: true
391
+ sharding_strategy: FULL_SHARD
392
+ wrapping_strategy: null
393
+ precision: mixed
394
+ hybrid_sharding_num_model_replicas: null
395
+ ddp:
396
+ grad_sync_mode: batch
397
+ find_unused_params: false
398
+ single:
399
+ device: auto
400
+ softmax_auxiliary_loss: false
401
+ auxiliary_loss_multiplier: 0.0001
402
+ time_limit: null
403
+ extra_steps_after_cancel: 10
404
+ early_stopping_factor: null
405
+ save_data_indices: true
406
+ python_profiling: false
407
+ torch_profiling: false
408
+ stop_at: null
409
+ stop_after: null
410
+ activation_checkpointing: null
411
+ fused_loss: null
412
+ hf_datasets_cache_dir: null
413
+ module_outputs_save_steps: null
models/OLMo-1B/step0/rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8656900295e73f2975cd8b840d83017c49b9c965e692fd5fe2c97d6b5d3cab13
3
+ size 1139384511
models/OLMo-1B/step0/rank1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8e21d37a4d5882841d9b12b18948620ac67d70aee1dfeb02f04e97fddd0cebf
3
+ size 1139384575
models/OLMo-1B/step0/rank2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:677da39ec798f0ff13eccf35eab23cdc576d97ddd48b7fe2bc4cef243ce372fe
3
+ size 1139384575
models/OLMo-1B/step0/rank3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a42c1183824377a0fb3a6d09d1ff637f42a0504f54a65eab12fc2793d0ae813
3
+ size 1139384575
models/OLMo-1B/step12000-unsharded/config.yaml ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-1B-as_fm3_omi2
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 2048
7
+ n_heads: 16
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 16
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: false
22
+ attention_dropout: 0.0
23
+ multi_query_attention: false
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: true
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0005
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: cosine_with_warmup
65
+ units: steps
66
+ t_warmup: 2000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
88
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00019_00000_doc_shuffled.ds
89
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
90
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
91
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
92
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
93
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
94
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
95
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
96
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
97
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
98
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
99
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
100
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
101
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00032_00000_doc_shuffled.ds
102
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00033_00000_doc_shuffled.ds
103
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00034_00000_doc_shuffled.ds
104
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00035_00000_doc_shuffled.ds
105
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00036_00000_doc_shuffled.ds
106
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00037_00000_doc_shuffled.ds
107
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00038_00000_doc_shuffled.ds
108
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00039_00000_doc_shuffled.ds
109
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00040_00000_doc_shuffled.ds
110
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00041_00000_doc_shuffled.ds
111
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00042_00000_doc_shuffled.ds
112
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00043_00000_doc_shuffled.ds
113
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00044_00000_doc_shuffled.ds
114
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00045_00000_doc_shuffled.ds
115
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00046_00000_doc_shuffled.ds
116
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00047_00000_doc_shuffled.ds
117
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00048_00000_doc_shuffled.ds
118
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00049_00000_doc_shuffled.ds
119
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00050_00000_doc_shuffled.ds
120
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00051_00000_doc_shuffled.ds
121
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00052_00000_doc_shuffled.ds
122
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00053_00000_doc_shuffled.ds
123
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00054_00000_doc_shuffled.ds
124
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00055_00000_doc_shuffled.ds
125
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00056_00000_doc_shuffled.ds
126
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00057_00000_doc_shuffled.ds
127
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00058_00000_doc_shuffled.ds
128
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00059_00000_doc_shuffled.ds
129
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00060_00000_doc_shuffled.ds
130
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00061_00000_doc_shuffled.ds
131
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00062_00000_doc_shuffled.ds
132
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00063_00000_doc_shuffled.ds
133
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00064_00000_doc_shuffled.ds
134
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00065_00000_doc_shuffled.ds
135
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00066_00000_doc_shuffled.ds
136
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00067_00000_doc_shuffled.ds
137
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00068_00000_doc_shuffled.ds
138
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00069_00000_doc_shuffled.ds
139
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00070_00000_doc_shuffled.ds
140
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00071_00000_doc_shuffled.ds
141
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00072_00000_doc_shuffled.ds
142
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00073_00000_doc_shuffled.ds
143
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00074_00000_doc_shuffled.ds
144
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00075_00000_doc_shuffled.ds
145
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00076_00000_doc_shuffled.ds
146
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00077_00000_doc_shuffled.ds
147
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00078_00000_doc_shuffled.ds
148
+ - data_token/as_fm3_omi2/finemath3-tokenized/00000_00000_doc_shuffled.ds
149
+ - data_token/as_fm3_omi2/finemath3-tokenized/00001_00000_doc_shuffled.ds
150
+ - data_token/as_fm3_omi2/finemath3-tokenized/00002_00000_doc_shuffled.ds
151
+ - data_token/as_fm3_omi2/finemath3-tokenized/00003_00000_doc_shuffled.ds
152
+ - data_token/as_fm3_omi2/finemath3-tokenized/00004_00000_doc_shuffled.ds
153
+ - data_token/as_fm3_omi2/finemath3-tokenized/00005_00000_doc_shuffled.ds
154
+ - data_token/as_fm3_omi2/finemath3-tokenized/00006_00000_doc_shuffled.ds
155
+ - data_token/as_fm3_omi2/finemath3-tokenized/00007_00000_doc_shuffled.ds
156
+ - data_token/as_fm3_omi2/finemath3-tokenized/00008_00000_doc_shuffled.ds
157
+ - data_token/as_fm3_omi2/finemath3-tokenized/00009_00000_doc_shuffled.ds
158
+ - data_token/as_fm3_omi2/finemath3-tokenized/00010_00000_doc_shuffled.ds
159
+ - data_token/as_fm3_omi2/finemath3-tokenized/00011_00000_doc_shuffled.ds
160
+ - data_token/as_fm3_omi2/finemath3-tokenized/00012_00000_doc_shuffled.ds
161
+ - data_token/as_fm3_omi2/finemath3-tokenized/00013_00000_doc_shuffled.ds
162
+ - data_token/as_fm3_omi2/finemath3-tokenized/00014_00000_doc_shuffled.ds
163
+ - data_token/as_fm3_omi2/finemath3-tokenized/00015_00000_doc_shuffled.ds
164
+ - data_token/as_fm3_omi2/finemath3-tokenized/00016_00000_doc_shuffled.ds
165
+ - data_token/as_fm3_omi2/finemath3-tokenized/00017_00000_doc_shuffled.ds
166
+ - data_token/as_fm3_omi2/finemath3-tokenized/00018_00000_doc_shuffled.ds
167
+ - data_token/as_fm3_omi2/finemath3-tokenized/00019_00000_doc_shuffled.ds
168
+ - data_token/as_fm3_omi2/finemath3-tokenized/00020_00000_doc_shuffled.ds
169
+ - data_token/as_fm3_omi2/finemath3-tokenized/00021_00000_doc_shuffled.ds
170
+ - data_token/as_fm3_omi2/finemath3-tokenized/00022_00000_doc_shuffled.ds
171
+ - data_token/as_fm3_omi2/finemath3-tokenized/00023_00000_doc_shuffled.ds
172
+ - data_token/as_fm3_omi2/finemath3-tokenized/00024_00000_doc_shuffled.ds
173
+ - data_token/as_fm3_omi2/finemath3-tokenized/00025_00000_doc_shuffled.ds
174
+ - data_token/as_fm3_omi2/finemath3-tokenized/00026_00000_doc_shuffled.ds
175
+ - data_token/as_fm3_omi2/finemath3-tokenized/00027_00000_doc_shuffled.ds
176
+ - data_token/as_fm3_omi2/finemath3-tokenized/00028_00000_doc_shuffled.ds
177
+ - data_token/as_fm3_omi2/finemath3-tokenized/00029_00000_doc_shuffled.ds
178
+ - data_token/as_fm3_omi2/finemath3-tokenized/00030_00000_doc_shuffled.ds
179
+ - data_token/as_fm3_omi2/finemath3-tokenized/00031_00000_doc_shuffled.ds
180
+ - data_token/as_fm3_omi2/finemath3-tokenized/00032_00000_doc_shuffled.ds
181
+ - data_token/as_fm3_omi2/finemath3-tokenized/00033_00000_doc_shuffled.ds
182
+ - data_token/as_fm3_omi2/finemath3-tokenized/00034_00000_doc_shuffled.ds
183
+ - data_token/as_fm3_omi2/finemath3-tokenized/00035_00000_doc_shuffled.ds
184
+ - data_token/as_fm3_omi2/finemath3-tokenized/00036_00000_doc_shuffled.ds
185
+ - data_token/as_fm3_omi2/finemath3-tokenized/00037_00000_doc_shuffled.ds
186
+ - data_token/as_fm3_omi2/finemath3-tokenized/00038_00000_doc_shuffled.ds
187
+ - data_token/as_fm3_omi2/finemath3-tokenized/00039_00000_doc_shuffled.ds
188
+ - data_token/as_fm3_omi2/finemath3-tokenized/00040_00000_doc_shuffled.ds
189
+ - data_token/as_fm3_omi2/finemath3-tokenized/00041_00000_doc_shuffled.ds
190
+ - data_token/as_fm3_omi2/finemath3-tokenized/00042_00000_doc_shuffled.ds
191
+ - data_token/as_fm3_omi2/finemath3-tokenized/00043_00000_doc_shuffled.ds
192
+ - data_token/as_fm3_omi2/finemath3-tokenized/00044_00000_doc_shuffled.ds
193
+ - data_token/as_fm3_omi2/finemath3-tokenized/00045_00000_doc_shuffled.ds
194
+ - data_token/as_fm3_omi2/finemath3-tokenized/00046_00000_doc_shuffled.ds
195
+ - data_token/as_fm3_omi2/finemath3-tokenized/00047_00000_doc_shuffled.ds
196
+ - data_token/as_fm3_omi2/finemath3-tokenized/00048_00000_doc_shuffled.ds
197
+ - data_token/as_fm3_omi2/finemath3-tokenized/00049_00000_doc_shuffled.ds
198
+ - data_token/as_fm3_omi2/finemath3-tokenized/00050_00000_doc_shuffled.ds
199
+ - data_token/as_fm3_omi2/finemath3-tokenized/00051_00000_doc_shuffled.ds
200
+ - data_token/as_fm3_omi2/finemath3-tokenized/00052_00000_doc_shuffled.ds
201
+ - data_token/as_fm3_omi2/finemath3-tokenized/00053_00000_doc_shuffled.ds
202
+ - data_token/as_fm3_omi2/finemath3-tokenized/00054_00000_doc_shuffled.ds
203
+ - data_token/as_fm3_omi2/finemath3-tokenized/00055_00000_doc_shuffled.ds
204
+ - data_token/as_fm3_omi2/finemath3-tokenized/00056_00000_doc_shuffled.ds
205
+ - data_token/as_fm3_omi2/finemath3-tokenized/00057_00000_doc_shuffled.ds
206
+ - data_token/as_fm3_omi2/finemath3-tokenized/00058_00000_doc_shuffled.ds
207
+ - data_token/as_fm3_omi2/finemath3-tokenized/00059_00000_doc_shuffled.ds
208
+ - data_token/as_fm3_omi2/finemath3-tokenized/00060_00000_doc_shuffled.ds
209
+ - data_token/as_fm3_omi2/finemath3-tokenized/00061_00000_doc_shuffled.ds
210
+ - data_token/as_fm3_omi2/finemath3-tokenized/00062_00000_doc_shuffled.ds
211
+ - data_token/as_fm3_omi2/finemath3-tokenized/00063_00000_doc_shuffled.ds
212
+ - data_token/as_fm3_omi2/finemath3-tokenized/00064_00000_doc_shuffled.ds
213
+ - data_token/as_fm3_omi2/finemath3-tokenized/00065_00000_doc_shuffled.ds
214
+ - data_token/as_fm3_omi2/finemath3-tokenized/00066_00000_doc_shuffled.ds
215
+ - data_token/as_fm3_omi2/finemath3-tokenized/00067_00000_doc_shuffled.ds
216
+ - data_token/as_fm3_omi2/finemath3-tokenized/00068_00000_doc_shuffled.ds
217
+ - data_token/as_fm3_omi2/finemath3-tokenized/00069_00000_doc_shuffled.ds
218
+ - data_token/as_fm3_omi2/finemath3-tokenized/00070_00000_doc_shuffled.ds
219
+ - data_token/as_fm3_omi2/finemath3-tokenized/00071_00000_doc_shuffled.ds
220
+ - data_token/as_fm3_omi2/finemath3-tokenized/00072_00000_doc_shuffled.ds
221
+ - data_token/as_fm3_omi2/finemath3-tokenized/00073_00000_doc_shuffled.ds
222
+ - data_token/as_fm3_omi2/finemath3-tokenized/00074_00000_doc_shuffled.ds
223
+ - data_token/as_fm3_omi2/finemath3-tokenized/00075_00000_doc_shuffled.ds
224
+ - data_token/as_fm3_omi2/finemath3-tokenized/00076_00000_doc_shuffled.ds
225
+ - data_token/as_fm3_omi2/finemath3-tokenized/00077_00000_doc_shuffled.ds
226
+ - data_token/as_fm3_omi2/finemath3-tokenized/00078_00000_doc_shuffled.ds
227
+ - data_token/as_fm3_omi2/finemath3-tokenized/00079_00000_doc_shuffled.ds
228
+ - data_token/as_fm3_omi2/finemath3-tokenized/00080_00000_doc_shuffled.ds
229
+ - data_token/as_fm3_omi2/finemath3-tokenized/00081_00000_doc_shuffled.ds
230
+ - data_token/as_fm3_omi2/finemath3-tokenized/00082_00000_doc_shuffled.ds
231
+ - data_token/as_fm3_omi2/finemath3-tokenized/00083_00000_doc_shuffled.ds
232
+ - data_token/as_fm3_omi2/finemath3-tokenized/00084_00000_doc_shuffled.ds
233
+ - data_token/as_fm3_omi2/finemath3-tokenized/00085_00000_doc_shuffled.ds
234
+ - data_token/as_fm3_omi2/finemath3-tokenized/00086_00000_doc_shuffled.ds
235
+ - data_token/as_fm3_omi2/finemath3-tokenized/00087_00000_doc_shuffled.ds
236
+ - data_token/as_fm3_omi2/finemath3-tokenized/00088_00000_doc_shuffled.ds
237
+ - data_token/as_fm3_omi2/finemath3-tokenized/00089_00000_doc_shuffled.ds
238
+ - data_token/as_fm3_omi2/finemath3-tokenized/00090_00000_doc_shuffled.ds
239
+ - data_token/as_fm3_omi2/finemath3-tokenized/00091_00000_doc_shuffled.ds
240
+ - data_token/as_fm3_omi2/finemath3-tokenized/00092_00000_doc_shuffled.ds
241
+ - data_token/as_fm3_omi2/finemath3-tokenized/00093_00000_doc_shuffled.ds
242
+ - data_token/as_fm3_omi2/finemath3-tokenized/00094_00000_doc_shuffled.ds
243
+ - data_token/as_fm3_omi2/finemath3-tokenized/00095_00000_doc_shuffled.ds
244
+ - data_token/as_fm3_omi2/finemath3-tokenized/00096_00000_doc_shuffled.ds
245
+ - data_token/as_fm3_omi2/finemath3-tokenized/00097_00000_doc_shuffled.ds
246
+ - data_token/as_fm3_omi2/finemath3-tokenized/00098_00000_doc_shuffled.ds
247
+ - data_token/as_fm3_omi2/finemath3-tokenized/00099_00000_doc_shuffled.ds
248
+ - data_token/as_fm3_omi2/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
249
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
250
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
251
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
252
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
253
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
254
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
255
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
256
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
257
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
258
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
259
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
260
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
261
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
262
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
263
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
264
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
265
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
266
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
267
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
268
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
269
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
270
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
271
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
272
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
273
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
274
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
275
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
276
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
277
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
278
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
279
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
280
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
281
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00032_00000_doc_shuffled.ds
282
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00033_00000_doc_shuffled.ds
283
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00034_00000_doc_shuffled.ds
284
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00035_00000_doc_shuffled.ds
285
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00036_00000_doc_shuffled.ds
286
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00037_00000_doc_shuffled.ds
287
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00038_00000_doc_shuffled.ds
288
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00039_00000_doc_shuffled.ds
289
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00040_00000_doc_shuffled.ds
290
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00041_00000_doc_shuffled.ds
291
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00042_00000_doc_shuffled.ds
292
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00043_00000_doc_shuffled.ds
293
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00044_00000_doc_shuffled.ds
294
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00045_00000_doc_shuffled.ds
295
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00046_00000_doc_shuffled.ds
296
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00047_00000_doc_shuffled.ds
297
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00048_00000_doc_shuffled.ds
298
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00049_00000_doc_shuffled.ds
299
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00050_00000_doc_shuffled.ds
300
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00051_00000_doc_shuffled.ds
301
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00052_00000_doc_shuffled.ds
302
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00053_00000_doc_shuffled.ds
303
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00054_00000_doc_shuffled.ds
304
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00000_00000_doc_shuffled.ds
305
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00001_00000_doc_shuffled.ds
306
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00002_00000_doc_shuffled.ds
307
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00003_00000_doc_shuffled.ds
308
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00004_00000_doc_shuffled.ds
309
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00005_00000_doc_shuffled.ds
310
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00006_00000_doc_shuffled.ds
311
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00007_00000_doc_shuffled.ds
312
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00008_00000_doc_shuffled.ds
313
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00009_00000_doc_shuffled.ds
314
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00010_00000_doc_shuffled.ds
315
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00011_00000_doc_shuffled.ds
316
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00012_00000_doc_shuffled.ds
317
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00013_00000_doc_shuffled.ds
318
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00014_00000_doc_shuffled.ds
319
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00015_00000_doc_shuffled.ds
320
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00016_00000_doc_shuffled.ds
321
+ memmap_dtype: uint16
322
+ datasets: null
323
+ label_mask_paths: null
324
+ pad_direction: right
325
+ generate_attention_mask: false
326
+ generate_doc_lengths: false
327
+ num_workers: 32
328
+ drop_last: true
329
+ pin_memory: true
330
+ prefetch_factor: 8
331
+ persistent_workers: true
332
+ timeout: 0
333
+ seed: null
334
+ instance_filter: null
335
+ custom_dataset: null
336
+ restore_dataloader: true
337
+ fast_forward_batches: null
338
+ evaluators: []
339
+ eval_interval: 5000
340
+ tokenizer:
341
+ identifier: meta-llama/Llama-2-7b-hf
342
+ truncate_direction: right
343
+ save_folder: checkpoints/OLMo-1B-as_fm3_omi2
344
+ remote_save_folder: null
345
+ canceled_check_interval: 6000
346
+ save_interval: 3000
347
+ save_interval_unsharded: 3000
348
+ save_interval_ephemeral: null
349
+ save_num_checkpoints_to_keep: -1
350
+ save_num_unsharded_checkpoints_to_keep: -1
351
+ save_overwrite: true
352
+ force_save_unsharded: false
353
+ no_pre_train_checkpoint: false
354
+ load_path: checkpoints/OLMo-1B-as_fm3_omi2/step9000-unsharded
355
+ load_path_sharded_checkpointer: null
356
+ try_load_latest_save: false
357
+ reset_optimizer_state: false
358
+ reset_trainer_state: false
359
+ sharded_checkpointer: torch_legacy
360
+ new_style_checkpoints: null
361
+ max_duration: 1ep
362
+ global_train_batch_size: 512
363
+ device_train_batch_size: 128
364
+ device_train_microbatch_size: 16
365
+ device_eval_batch_size: 16
366
+ eval_subset_num_batches: -1
367
+ eval_on_load: false
368
+ device_train_grad_accum: 8
369
+ max_grad_norm: 1.0
370
+ max_grad_norm_ratio: null
371
+ precision: amp_bf16
372
+ wandb:
373
+ project: olmo-debug
374
+ entity: null
375
+ group: null
376
+ name: OLMo-1B-as_fm3_omi2
377
+ tags:
378
+ - watching
379
+ log_artifacts: false
380
+ rank_zero_only: true
381
+ log_interval: 1
382
+ speed_monitor:
383
+ window_size: 20
384
+ gpu_flops_available: null
385
+ console_log_interval: 1
386
+ gen1_gc_interval: 1
387
+ compile: null
388
+ distributed_strategy: fsdp
389
+ fsdp:
390
+ use_orig_params: true
391
+ sharding_strategy: FULL_SHARD
392
+ wrapping_strategy: null
393
+ precision: mixed
394
+ hybrid_sharding_num_model_replicas: null
395
+ ddp:
396
+ grad_sync_mode: batch
397
+ find_unused_params: false
398
+ single:
399
+ device: auto
400
+ softmax_auxiliary_loss: false
401
+ auxiliary_loss_multiplier: 0.0001
402
+ time_limit: null
403
+ extra_steps_after_cancel: 10
404
+ early_stopping_factor: null
405
+ save_data_indices: true
406
+ python_profiling: false
407
+ torch_profiling: false
408
+ stop_at: 62228
409
+ stop_after: null
410
+ activation_checkpointing: null
411
+ fused_loss: null
412
+ hf_datasets_cache_dir: null
413
+ module_outputs_save_steps: null
models/OLMo-1B/step12000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1022913cab05be05a63e0163920e25aa0b2ae9dd5ffc07ce618d485114c4049
3
+ size 4557141517
models/OLMo-1B/step12000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:137112cad4dba64842c533a52ee069c16260508ab080735a172fdfd413529a42
3
+ size 9114282430
models/OLMo-1B/step12000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9015dea0efebf624dc52ad6fefafab407fb9f2e08f03d8dea8a3d6584351d90
3
+ size 14796
models/OLMo-1B/step12000/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "OlmoForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "clip_qkv": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 50279,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 8192,
14
+ "max_position_embeddings": 2048,
15
+ "model_type": "olmo",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 16,
18
+ "num_key_value_heads": 16,
19
+ "pad_token_id": 1,
20
+ "rope_scaling": null,
21
+ "rope_theta": 10000.0,
22
+ "tie_word_embeddings": true,
23
+ "transformers_version": "4.57.3",
24
+ "use_cache": true,
25
+ "vocab_size": 32000
26
+ }
models/OLMo-1B/step12000/config.yaml ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-1B-as_fm3_omi2
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 2048
7
+ n_heads: 16
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 16
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: false
22
+ attention_dropout: 0.0
23
+ multi_query_attention: false
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: true
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0005
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: cosine_with_warmup
65
+ units: steps
66
+ t_warmup: 2000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
88
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00019_00000_doc_shuffled.ds
89
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
90
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
91
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
92
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
93
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
94
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
95
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
96
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
97
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
98
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
99
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
100
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
101
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00032_00000_doc_shuffled.ds
102
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00033_00000_doc_shuffled.ds
103
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00034_00000_doc_shuffled.ds
104
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00035_00000_doc_shuffled.ds
105
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00036_00000_doc_shuffled.ds
106
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00037_00000_doc_shuffled.ds
107
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00038_00000_doc_shuffled.ds
108
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00039_00000_doc_shuffled.ds
109
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00040_00000_doc_shuffled.ds
110
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00041_00000_doc_shuffled.ds
111
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00042_00000_doc_shuffled.ds
112
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00043_00000_doc_shuffled.ds
113
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00044_00000_doc_shuffled.ds
114
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00045_00000_doc_shuffled.ds
115
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00046_00000_doc_shuffled.ds
116
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00047_00000_doc_shuffled.ds
117
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00048_00000_doc_shuffled.ds
118
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00049_00000_doc_shuffled.ds
119
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00050_00000_doc_shuffled.ds
120
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00051_00000_doc_shuffled.ds
121
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00052_00000_doc_shuffled.ds
122
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00053_00000_doc_shuffled.ds
123
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00054_00000_doc_shuffled.ds
124
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00055_00000_doc_shuffled.ds
125
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00056_00000_doc_shuffled.ds
126
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00057_00000_doc_shuffled.ds
127
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00058_00000_doc_shuffled.ds
128
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00059_00000_doc_shuffled.ds
129
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00060_00000_doc_shuffled.ds
130
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00061_00000_doc_shuffled.ds
131
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00062_00000_doc_shuffled.ds
132
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00063_00000_doc_shuffled.ds
133
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00064_00000_doc_shuffled.ds
134
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00065_00000_doc_shuffled.ds
135
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00066_00000_doc_shuffled.ds
136
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00067_00000_doc_shuffled.ds
137
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00068_00000_doc_shuffled.ds
138
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00069_00000_doc_shuffled.ds
139
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00070_00000_doc_shuffled.ds
140
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00071_00000_doc_shuffled.ds
141
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00072_00000_doc_shuffled.ds
142
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00073_00000_doc_shuffled.ds
143
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00074_00000_doc_shuffled.ds
144
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00075_00000_doc_shuffled.ds
145
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00076_00000_doc_shuffled.ds
146
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00077_00000_doc_shuffled.ds
147
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00078_00000_doc_shuffled.ds
148
+ - data_token/as_fm3_omi2/finemath3-tokenized/00000_00000_doc_shuffled.ds
149
+ - data_token/as_fm3_omi2/finemath3-tokenized/00001_00000_doc_shuffled.ds
150
+ - data_token/as_fm3_omi2/finemath3-tokenized/00002_00000_doc_shuffled.ds
151
+ - data_token/as_fm3_omi2/finemath3-tokenized/00003_00000_doc_shuffled.ds
152
+ - data_token/as_fm3_omi2/finemath3-tokenized/00004_00000_doc_shuffled.ds
153
+ - data_token/as_fm3_omi2/finemath3-tokenized/00005_00000_doc_shuffled.ds
154
+ - data_token/as_fm3_omi2/finemath3-tokenized/00006_00000_doc_shuffled.ds
155
+ - data_token/as_fm3_omi2/finemath3-tokenized/00007_00000_doc_shuffled.ds
156
+ - data_token/as_fm3_omi2/finemath3-tokenized/00008_00000_doc_shuffled.ds
157
+ - data_token/as_fm3_omi2/finemath3-tokenized/00009_00000_doc_shuffled.ds
158
+ - data_token/as_fm3_omi2/finemath3-tokenized/00010_00000_doc_shuffled.ds
159
+ - data_token/as_fm3_omi2/finemath3-tokenized/00011_00000_doc_shuffled.ds
160
+ - data_token/as_fm3_omi2/finemath3-tokenized/00012_00000_doc_shuffled.ds
161
+ - data_token/as_fm3_omi2/finemath3-tokenized/00013_00000_doc_shuffled.ds
162
+ - data_token/as_fm3_omi2/finemath3-tokenized/00014_00000_doc_shuffled.ds
163
+ - data_token/as_fm3_omi2/finemath3-tokenized/00015_00000_doc_shuffled.ds
164
+ - data_token/as_fm3_omi2/finemath3-tokenized/00016_00000_doc_shuffled.ds
165
+ - data_token/as_fm3_omi2/finemath3-tokenized/00017_00000_doc_shuffled.ds
166
+ - data_token/as_fm3_omi2/finemath3-tokenized/00018_00000_doc_shuffled.ds
167
+ - data_token/as_fm3_omi2/finemath3-tokenized/00019_00000_doc_shuffled.ds
168
+ - data_token/as_fm3_omi2/finemath3-tokenized/00020_00000_doc_shuffled.ds
169
+ - data_token/as_fm3_omi2/finemath3-tokenized/00021_00000_doc_shuffled.ds
170
+ - data_token/as_fm3_omi2/finemath3-tokenized/00022_00000_doc_shuffled.ds
171
+ - data_token/as_fm3_omi2/finemath3-tokenized/00023_00000_doc_shuffled.ds
172
+ - data_token/as_fm3_omi2/finemath3-tokenized/00024_00000_doc_shuffled.ds
173
+ - data_token/as_fm3_omi2/finemath3-tokenized/00025_00000_doc_shuffled.ds
174
+ - data_token/as_fm3_omi2/finemath3-tokenized/00026_00000_doc_shuffled.ds
175
+ - data_token/as_fm3_omi2/finemath3-tokenized/00027_00000_doc_shuffled.ds
176
+ - data_token/as_fm3_omi2/finemath3-tokenized/00028_00000_doc_shuffled.ds
177
+ - data_token/as_fm3_omi2/finemath3-tokenized/00029_00000_doc_shuffled.ds
178
+ - data_token/as_fm3_omi2/finemath3-tokenized/00030_00000_doc_shuffled.ds
179
+ - data_token/as_fm3_omi2/finemath3-tokenized/00031_00000_doc_shuffled.ds
180
+ - data_token/as_fm3_omi2/finemath3-tokenized/00032_00000_doc_shuffled.ds
181
+ - data_token/as_fm3_omi2/finemath3-tokenized/00033_00000_doc_shuffled.ds
182
+ - data_token/as_fm3_omi2/finemath3-tokenized/00034_00000_doc_shuffled.ds
183
+ - data_token/as_fm3_omi2/finemath3-tokenized/00035_00000_doc_shuffled.ds
184
+ - data_token/as_fm3_omi2/finemath3-tokenized/00036_00000_doc_shuffled.ds
185
+ - data_token/as_fm3_omi2/finemath3-tokenized/00037_00000_doc_shuffled.ds
186
+ - data_token/as_fm3_omi2/finemath3-tokenized/00038_00000_doc_shuffled.ds
187
+ - data_token/as_fm3_omi2/finemath3-tokenized/00039_00000_doc_shuffled.ds
188
+ - data_token/as_fm3_omi2/finemath3-tokenized/00040_00000_doc_shuffled.ds
189
+ - data_token/as_fm3_omi2/finemath3-tokenized/00041_00000_doc_shuffled.ds
190
+ - data_token/as_fm3_omi2/finemath3-tokenized/00042_00000_doc_shuffled.ds
191
+ - data_token/as_fm3_omi2/finemath3-tokenized/00043_00000_doc_shuffled.ds
192
+ - data_token/as_fm3_omi2/finemath3-tokenized/00044_00000_doc_shuffled.ds
193
+ - data_token/as_fm3_omi2/finemath3-tokenized/00045_00000_doc_shuffled.ds
194
+ - data_token/as_fm3_omi2/finemath3-tokenized/00046_00000_doc_shuffled.ds
195
+ - data_token/as_fm3_omi2/finemath3-tokenized/00047_00000_doc_shuffled.ds
196
+ - data_token/as_fm3_omi2/finemath3-tokenized/00048_00000_doc_shuffled.ds
197
+ - data_token/as_fm3_omi2/finemath3-tokenized/00049_00000_doc_shuffled.ds
198
+ - data_token/as_fm3_omi2/finemath3-tokenized/00050_00000_doc_shuffled.ds
199
+ - data_token/as_fm3_omi2/finemath3-tokenized/00051_00000_doc_shuffled.ds
200
+ - data_token/as_fm3_omi2/finemath3-tokenized/00052_00000_doc_shuffled.ds
201
+ - data_token/as_fm3_omi2/finemath3-tokenized/00053_00000_doc_shuffled.ds
202
+ - data_token/as_fm3_omi2/finemath3-tokenized/00054_00000_doc_shuffled.ds
203
+ - data_token/as_fm3_omi2/finemath3-tokenized/00055_00000_doc_shuffled.ds
204
+ - data_token/as_fm3_omi2/finemath3-tokenized/00056_00000_doc_shuffled.ds
205
+ - data_token/as_fm3_omi2/finemath3-tokenized/00057_00000_doc_shuffled.ds
206
+ - data_token/as_fm3_omi2/finemath3-tokenized/00058_00000_doc_shuffled.ds
207
+ - data_token/as_fm3_omi2/finemath3-tokenized/00059_00000_doc_shuffled.ds
208
+ - data_token/as_fm3_omi2/finemath3-tokenized/00060_00000_doc_shuffled.ds
209
+ - data_token/as_fm3_omi2/finemath3-tokenized/00061_00000_doc_shuffled.ds
210
+ - data_token/as_fm3_omi2/finemath3-tokenized/00062_00000_doc_shuffled.ds
211
+ - data_token/as_fm3_omi2/finemath3-tokenized/00063_00000_doc_shuffled.ds
212
+ - data_token/as_fm3_omi2/finemath3-tokenized/00064_00000_doc_shuffled.ds
213
+ - data_token/as_fm3_omi2/finemath3-tokenized/00065_00000_doc_shuffled.ds
214
+ - data_token/as_fm3_omi2/finemath3-tokenized/00066_00000_doc_shuffled.ds
215
+ - data_token/as_fm3_omi2/finemath3-tokenized/00067_00000_doc_shuffled.ds
216
+ - data_token/as_fm3_omi2/finemath3-tokenized/00068_00000_doc_shuffled.ds
217
+ - data_token/as_fm3_omi2/finemath3-tokenized/00069_00000_doc_shuffled.ds
218
+ - data_token/as_fm3_omi2/finemath3-tokenized/00070_00000_doc_shuffled.ds
219
+ - data_token/as_fm3_omi2/finemath3-tokenized/00071_00000_doc_shuffled.ds
220
+ - data_token/as_fm3_omi2/finemath3-tokenized/00072_00000_doc_shuffled.ds
221
+ - data_token/as_fm3_omi2/finemath3-tokenized/00073_00000_doc_shuffled.ds
222
+ - data_token/as_fm3_omi2/finemath3-tokenized/00074_00000_doc_shuffled.ds
223
+ - data_token/as_fm3_omi2/finemath3-tokenized/00075_00000_doc_shuffled.ds
224
+ - data_token/as_fm3_omi2/finemath3-tokenized/00076_00000_doc_shuffled.ds
225
+ - data_token/as_fm3_omi2/finemath3-tokenized/00077_00000_doc_shuffled.ds
226
+ - data_token/as_fm3_omi2/finemath3-tokenized/00078_00000_doc_shuffled.ds
227
+ - data_token/as_fm3_omi2/finemath3-tokenized/00079_00000_doc_shuffled.ds
228
+ - data_token/as_fm3_omi2/finemath3-tokenized/00080_00000_doc_shuffled.ds
229
+ - data_token/as_fm3_omi2/finemath3-tokenized/00081_00000_doc_shuffled.ds
230
+ - data_token/as_fm3_omi2/finemath3-tokenized/00082_00000_doc_shuffled.ds
231
+ - data_token/as_fm3_omi2/finemath3-tokenized/00083_00000_doc_shuffled.ds
232
+ - data_token/as_fm3_omi2/finemath3-tokenized/00084_00000_doc_shuffled.ds
233
+ - data_token/as_fm3_omi2/finemath3-tokenized/00085_00000_doc_shuffled.ds
234
+ - data_token/as_fm3_omi2/finemath3-tokenized/00086_00000_doc_shuffled.ds
235
+ - data_token/as_fm3_omi2/finemath3-tokenized/00087_00000_doc_shuffled.ds
236
+ - data_token/as_fm3_omi2/finemath3-tokenized/00088_00000_doc_shuffled.ds
237
+ - data_token/as_fm3_omi2/finemath3-tokenized/00089_00000_doc_shuffled.ds
238
+ - data_token/as_fm3_omi2/finemath3-tokenized/00090_00000_doc_shuffled.ds
239
+ - data_token/as_fm3_omi2/finemath3-tokenized/00091_00000_doc_shuffled.ds
240
+ - data_token/as_fm3_omi2/finemath3-tokenized/00092_00000_doc_shuffled.ds
241
+ - data_token/as_fm3_omi2/finemath3-tokenized/00093_00000_doc_shuffled.ds
242
+ - data_token/as_fm3_omi2/finemath3-tokenized/00094_00000_doc_shuffled.ds
243
+ - data_token/as_fm3_omi2/finemath3-tokenized/00095_00000_doc_shuffled.ds
244
+ - data_token/as_fm3_omi2/finemath3-tokenized/00096_00000_doc_shuffled.ds
245
+ - data_token/as_fm3_omi2/finemath3-tokenized/00097_00000_doc_shuffled.ds
246
+ - data_token/as_fm3_omi2/finemath3-tokenized/00098_00000_doc_shuffled.ds
247
+ - data_token/as_fm3_omi2/finemath3-tokenized/00099_00000_doc_shuffled.ds
248
+ - data_token/as_fm3_omi2/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
249
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
250
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
251
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
252
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
253
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
254
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
255
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
256
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
257
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
258
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
259
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
260
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
261
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
262
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
263
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
264
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
265
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
266
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
267
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
268
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
269
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
270
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
271
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
272
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
273
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
274
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
275
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
276
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
277
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
278
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
279
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
280
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
281
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00032_00000_doc_shuffled.ds
282
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00033_00000_doc_shuffled.ds
283
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00034_00000_doc_shuffled.ds
284
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00035_00000_doc_shuffled.ds
285
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00036_00000_doc_shuffled.ds
286
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00037_00000_doc_shuffled.ds
287
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00038_00000_doc_shuffled.ds
288
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00039_00000_doc_shuffled.ds
289
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00040_00000_doc_shuffled.ds
290
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00041_00000_doc_shuffled.ds
291
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00042_00000_doc_shuffled.ds
292
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00043_00000_doc_shuffled.ds
293
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00044_00000_doc_shuffled.ds
294
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00045_00000_doc_shuffled.ds
295
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00046_00000_doc_shuffled.ds
296
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00047_00000_doc_shuffled.ds
297
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00048_00000_doc_shuffled.ds
298
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00049_00000_doc_shuffled.ds
299
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00050_00000_doc_shuffled.ds
300
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00051_00000_doc_shuffled.ds
301
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00052_00000_doc_shuffled.ds
302
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00053_00000_doc_shuffled.ds
303
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00054_00000_doc_shuffled.ds
304
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00000_00000_doc_shuffled.ds
305
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00001_00000_doc_shuffled.ds
306
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00002_00000_doc_shuffled.ds
307
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00003_00000_doc_shuffled.ds
308
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00004_00000_doc_shuffled.ds
309
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00005_00000_doc_shuffled.ds
310
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00006_00000_doc_shuffled.ds
311
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00007_00000_doc_shuffled.ds
312
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00008_00000_doc_shuffled.ds
313
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00009_00000_doc_shuffled.ds
314
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00010_00000_doc_shuffled.ds
315
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00011_00000_doc_shuffled.ds
316
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00012_00000_doc_shuffled.ds
317
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00013_00000_doc_shuffled.ds
318
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00014_00000_doc_shuffled.ds
319
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00015_00000_doc_shuffled.ds
320
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00016_00000_doc_shuffled.ds
321
+ memmap_dtype: uint16
322
+ datasets: null
323
+ label_mask_paths: null
324
+ pad_direction: right
325
+ generate_attention_mask: false
326
+ generate_doc_lengths: false
327
+ num_workers: 32
328
+ drop_last: true
329
+ pin_memory: true
330
+ prefetch_factor: 8
331
+ persistent_workers: true
332
+ timeout: 0
333
+ seed: null
334
+ instance_filter: null
335
+ custom_dataset: null
336
+ restore_dataloader: true
337
+ fast_forward_batches: null
338
+ evaluators: []
339
+ eval_interval: 5000
340
+ tokenizer:
341
+ identifier: meta-llama/Llama-2-7b-hf
342
+ truncate_direction: right
343
+ save_folder: checkpoints/OLMo-1B-as_fm3_omi2
344
+ remote_save_folder: null
345
+ canceled_check_interval: 6000
346
+ save_interval: 3000
347
+ save_interval_unsharded: 3000
348
+ save_interval_ephemeral: null
349
+ save_num_checkpoints_to_keep: -1
350
+ save_num_unsharded_checkpoints_to_keep: -1
351
+ save_overwrite: true
352
+ force_save_unsharded: false
353
+ no_pre_train_checkpoint: false
354
+ load_path: checkpoints/OLMo-1B-as_fm3_omi2/step9000-unsharded
355
+ load_path_sharded_checkpointer: null
356
+ try_load_latest_save: false
357
+ reset_optimizer_state: false
358
+ reset_trainer_state: false
359
+ sharded_checkpointer: torch_legacy
360
+ new_style_checkpoints: null
361
+ max_duration: 1ep
362
+ global_train_batch_size: 512
363
+ device_train_batch_size: 128
364
+ device_train_microbatch_size: 16
365
+ device_eval_batch_size: 16
366
+ eval_subset_num_batches: -1
367
+ eval_on_load: false
368
+ device_train_grad_accum: 8
369
+ max_grad_norm: 1.0
370
+ max_grad_norm_ratio: null
371
+ precision: amp_bf16
372
+ wandb:
373
+ project: olmo-debug
374
+ entity: null
375
+ group: null
376
+ name: OLMo-1B-as_fm3_omi2
377
+ tags:
378
+ - watching
379
+ log_artifacts: false
380
+ rank_zero_only: true
381
+ log_interval: 1
382
+ speed_monitor:
383
+ window_size: 20
384
+ gpu_flops_available: null
385
+ console_log_interval: 1
386
+ gen1_gc_interval: 1
387
+ compile: null
388
+ distributed_strategy: fsdp
389
+ fsdp:
390
+ use_orig_params: true
391
+ sharding_strategy: FULL_SHARD
392
+ wrapping_strategy: null
393
+ precision: mixed
394
+ hybrid_sharding_num_model_replicas: null
395
+ ddp:
396
+ grad_sync_mode: batch
397
+ find_unused_params: false
398
+ single:
399
+ device: auto
400
+ softmax_auxiliary_loss: false
401
+ auxiliary_loss_multiplier: 0.0001
402
+ time_limit: null
403
+ extra_steps_after_cancel: 10
404
+ early_stopping_factor: null
405
+ save_data_indices: true
406
+ python_profiling: false
407
+ torch_profiling: false
408
+ stop_at: 62228
409
+ stop_after: null
410
+ activation_checkpointing: null
411
+ fused_loss: null
412
+ hf_datasets_cache_dir: null
413
+ module_outputs_save_steps: null
models/OLMo-1B/step12000/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 50279,
4
+ "pad_token_id": 1,
5
+ "transformers_version": "4.57.3"
6
+ }
models/OLMo-1B/step12000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b4f710e5f4572210ab7946464b3d1064943c6d00191b4efdf27d5073bf35380
3
+ size 4557138549
models/OLMo-1B/step12000/rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2599ff48ed71abde528e18f04c493aeb4bde65c494050e3b2770ad9b63b35f4
3
+ size 3418099026
models/OLMo-1B/step12000/rank1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da334cb9b8322940d5e8b4828f252cbed7a48a9840a018a366034a291b0753f4
3
+ size 3418099218
models/OLMo-1B/step12000/rank2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5e88667e930048126e52cff4d12a1dafc6bc1c71f10845aa4bf69689db53329
3
+ size 3418099218
models/OLMo-1B/step12000/rank3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b80367d698b63187b146e277a2344891851201d414431f1212ce8b6ff5be2453
3
+ size 3418099218
models/OLMo-1B/step12000/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "pad_token": "<s>"
3
+ }
models/OLMo-1B/step12000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/OLMo-1B/step12000/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": null,
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "",
34
+ "extra_special_tokens": {},
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": "<s>",
37
+ "tokenizer_class": "GPTNeoXTokenizer",
38
+ "unk_token": null
39
+ }
models/OLMo-1B/step15000-unsharded/config.yaml ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-1B-as_fm3_omi2
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 2048
7
+ n_heads: 16
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 16
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: false
22
+ attention_dropout: 0.0
23
+ multi_query_attention: false
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: true
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0005
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: cosine_with_warmup
65
+ units: steps
66
+ t_warmup: 2000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
88
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00019_00000_doc_shuffled.ds
89
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
90
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
91
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
92
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
93
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
94
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
95
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
96
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
97
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
98
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
99
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
100
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
101
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00032_00000_doc_shuffled.ds
102
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00033_00000_doc_shuffled.ds
103
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00034_00000_doc_shuffled.ds
104
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00035_00000_doc_shuffled.ds
105
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00036_00000_doc_shuffled.ds
106
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00037_00000_doc_shuffled.ds
107
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00038_00000_doc_shuffled.ds
108
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00039_00000_doc_shuffled.ds
109
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00040_00000_doc_shuffled.ds
110
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00041_00000_doc_shuffled.ds
111
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00042_00000_doc_shuffled.ds
112
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00043_00000_doc_shuffled.ds
113
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00044_00000_doc_shuffled.ds
114
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00045_00000_doc_shuffled.ds
115
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00046_00000_doc_shuffled.ds
116
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00047_00000_doc_shuffled.ds
117
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00048_00000_doc_shuffled.ds
118
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00049_00000_doc_shuffled.ds
119
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00050_00000_doc_shuffled.ds
120
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00051_00000_doc_shuffled.ds
121
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00052_00000_doc_shuffled.ds
122
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00053_00000_doc_shuffled.ds
123
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00054_00000_doc_shuffled.ds
124
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00055_00000_doc_shuffled.ds
125
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00056_00000_doc_shuffled.ds
126
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00057_00000_doc_shuffled.ds
127
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00058_00000_doc_shuffled.ds
128
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00059_00000_doc_shuffled.ds
129
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00060_00000_doc_shuffled.ds
130
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00061_00000_doc_shuffled.ds
131
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00062_00000_doc_shuffled.ds
132
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00063_00000_doc_shuffled.ds
133
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00064_00000_doc_shuffled.ds
134
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00065_00000_doc_shuffled.ds
135
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00066_00000_doc_shuffled.ds
136
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00067_00000_doc_shuffled.ds
137
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00068_00000_doc_shuffled.ds
138
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00069_00000_doc_shuffled.ds
139
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00070_00000_doc_shuffled.ds
140
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00071_00000_doc_shuffled.ds
141
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00072_00000_doc_shuffled.ds
142
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00073_00000_doc_shuffled.ds
143
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00074_00000_doc_shuffled.ds
144
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00075_00000_doc_shuffled.ds
145
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00076_00000_doc_shuffled.ds
146
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00077_00000_doc_shuffled.ds
147
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00078_00000_doc_shuffled.ds
148
+ - data_token/as_fm3_omi2/finemath3-tokenized/00000_00000_doc_shuffled.ds
149
+ - data_token/as_fm3_omi2/finemath3-tokenized/00001_00000_doc_shuffled.ds
150
+ - data_token/as_fm3_omi2/finemath3-tokenized/00002_00000_doc_shuffled.ds
151
+ - data_token/as_fm3_omi2/finemath3-tokenized/00003_00000_doc_shuffled.ds
152
+ - data_token/as_fm3_omi2/finemath3-tokenized/00004_00000_doc_shuffled.ds
153
+ - data_token/as_fm3_omi2/finemath3-tokenized/00005_00000_doc_shuffled.ds
154
+ - data_token/as_fm3_omi2/finemath3-tokenized/00006_00000_doc_shuffled.ds
155
+ - data_token/as_fm3_omi2/finemath3-tokenized/00007_00000_doc_shuffled.ds
156
+ - data_token/as_fm3_omi2/finemath3-tokenized/00008_00000_doc_shuffled.ds
157
+ - data_token/as_fm3_omi2/finemath3-tokenized/00009_00000_doc_shuffled.ds
158
+ - data_token/as_fm3_omi2/finemath3-tokenized/00010_00000_doc_shuffled.ds
159
+ - data_token/as_fm3_omi2/finemath3-tokenized/00011_00000_doc_shuffled.ds
160
+ - data_token/as_fm3_omi2/finemath3-tokenized/00012_00000_doc_shuffled.ds
161
+ - data_token/as_fm3_omi2/finemath3-tokenized/00013_00000_doc_shuffled.ds
162
+ - data_token/as_fm3_omi2/finemath3-tokenized/00014_00000_doc_shuffled.ds
163
+ - data_token/as_fm3_omi2/finemath3-tokenized/00015_00000_doc_shuffled.ds
164
+ - data_token/as_fm3_omi2/finemath3-tokenized/00016_00000_doc_shuffled.ds
165
+ - data_token/as_fm3_omi2/finemath3-tokenized/00017_00000_doc_shuffled.ds
166
+ - data_token/as_fm3_omi2/finemath3-tokenized/00018_00000_doc_shuffled.ds
167
+ - data_token/as_fm3_omi2/finemath3-tokenized/00019_00000_doc_shuffled.ds
168
+ - data_token/as_fm3_omi2/finemath3-tokenized/00020_00000_doc_shuffled.ds
169
+ - data_token/as_fm3_omi2/finemath3-tokenized/00021_00000_doc_shuffled.ds
170
+ - data_token/as_fm3_omi2/finemath3-tokenized/00022_00000_doc_shuffled.ds
171
+ - data_token/as_fm3_omi2/finemath3-tokenized/00023_00000_doc_shuffled.ds
172
+ - data_token/as_fm3_omi2/finemath3-tokenized/00024_00000_doc_shuffled.ds
173
+ - data_token/as_fm3_omi2/finemath3-tokenized/00025_00000_doc_shuffled.ds
174
+ - data_token/as_fm3_omi2/finemath3-tokenized/00026_00000_doc_shuffled.ds
175
+ - data_token/as_fm3_omi2/finemath3-tokenized/00027_00000_doc_shuffled.ds
176
+ - data_token/as_fm3_omi2/finemath3-tokenized/00028_00000_doc_shuffled.ds
177
+ - data_token/as_fm3_omi2/finemath3-tokenized/00029_00000_doc_shuffled.ds
178
+ - data_token/as_fm3_omi2/finemath3-tokenized/00030_00000_doc_shuffled.ds
179
+ - data_token/as_fm3_omi2/finemath3-tokenized/00031_00000_doc_shuffled.ds
180
+ - data_token/as_fm3_omi2/finemath3-tokenized/00032_00000_doc_shuffled.ds
181
+ - data_token/as_fm3_omi2/finemath3-tokenized/00033_00000_doc_shuffled.ds
182
+ - data_token/as_fm3_omi2/finemath3-tokenized/00034_00000_doc_shuffled.ds
183
+ - data_token/as_fm3_omi2/finemath3-tokenized/00035_00000_doc_shuffled.ds
184
+ - data_token/as_fm3_omi2/finemath3-tokenized/00036_00000_doc_shuffled.ds
185
+ - data_token/as_fm3_omi2/finemath3-tokenized/00037_00000_doc_shuffled.ds
186
+ - data_token/as_fm3_omi2/finemath3-tokenized/00038_00000_doc_shuffled.ds
187
+ - data_token/as_fm3_omi2/finemath3-tokenized/00039_00000_doc_shuffled.ds
188
+ - data_token/as_fm3_omi2/finemath3-tokenized/00040_00000_doc_shuffled.ds
189
+ - data_token/as_fm3_omi2/finemath3-tokenized/00041_00000_doc_shuffled.ds
190
+ - data_token/as_fm3_omi2/finemath3-tokenized/00042_00000_doc_shuffled.ds
191
+ - data_token/as_fm3_omi2/finemath3-tokenized/00043_00000_doc_shuffled.ds
192
+ - data_token/as_fm3_omi2/finemath3-tokenized/00044_00000_doc_shuffled.ds
193
+ - data_token/as_fm3_omi2/finemath3-tokenized/00045_00000_doc_shuffled.ds
194
+ - data_token/as_fm3_omi2/finemath3-tokenized/00046_00000_doc_shuffled.ds
195
+ - data_token/as_fm3_omi2/finemath3-tokenized/00047_00000_doc_shuffled.ds
196
+ - data_token/as_fm3_omi2/finemath3-tokenized/00048_00000_doc_shuffled.ds
197
+ - data_token/as_fm3_omi2/finemath3-tokenized/00049_00000_doc_shuffled.ds
198
+ - data_token/as_fm3_omi2/finemath3-tokenized/00050_00000_doc_shuffled.ds
199
+ - data_token/as_fm3_omi2/finemath3-tokenized/00051_00000_doc_shuffled.ds
200
+ - data_token/as_fm3_omi2/finemath3-tokenized/00052_00000_doc_shuffled.ds
201
+ - data_token/as_fm3_omi2/finemath3-tokenized/00053_00000_doc_shuffled.ds
202
+ - data_token/as_fm3_omi2/finemath3-tokenized/00054_00000_doc_shuffled.ds
203
+ - data_token/as_fm3_omi2/finemath3-tokenized/00055_00000_doc_shuffled.ds
204
+ - data_token/as_fm3_omi2/finemath3-tokenized/00056_00000_doc_shuffled.ds
205
+ - data_token/as_fm3_omi2/finemath3-tokenized/00057_00000_doc_shuffled.ds
206
+ - data_token/as_fm3_omi2/finemath3-tokenized/00058_00000_doc_shuffled.ds
207
+ - data_token/as_fm3_omi2/finemath3-tokenized/00059_00000_doc_shuffled.ds
208
+ - data_token/as_fm3_omi2/finemath3-tokenized/00060_00000_doc_shuffled.ds
209
+ - data_token/as_fm3_omi2/finemath3-tokenized/00061_00000_doc_shuffled.ds
210
+ - data_token/as_fm3_omi2/finemath3-tokenized/00062_00000_doc_shuffled.ds
211
+ - data_token/as_fm3_omi2/finemath3-tokenized/00063_00000_doc_shuffled.ds
212
+ - data_token/as_fm3_omi2/finemath3-tokenized/00064_00000_doc_shuffled.ds
213
+ - data_token/as_fm3_omi2/finemath3-tokenized/00065_00000_doc_shuffled.ds
214
+ - data_token/as_fm3_omi2/finemath3-tokenized/00066_00000_doc_shuffled.ds
215
+ - data_token/as_fm3_omi2/finemath3-tokenized/00067_00000_doc_shuffled.ds
216
+ - data_token/as_fm3_omi2/finemath3-tokenized/00068_00000_doc_shuffled.ds
217
+ - data_token/as_fm3_omi2/finemath3-tokenized/00069_00000_doc_shuffled.ds
218
+ - data_token/as_fm3_omi2/finemath3-tokenized/00070_00000_doc_shuffled.ds
219
+ - data_token/as_fm3_omi2/finemath3-tokenized/00071_00000_doc_shuffled.ds
220
+ - data_token/as_fm3_omi2/finemath3-tokenized/00072_00000_doc_shuffled.ds
221
+ - data_token/as_fm3_omi2/finemath3-tokenized/00073_00000_doc_shuffled.ds
222
+ - data_token/as_fm3_omi2/finemath3-tokenized/00074_00000_doc_shuffled.ds
223
+ - data_token/as_fm3_omi2/finemath3-tokenized/00075_00000_doc_shuffled.ds
224
+ - data_token/as_fm3_omi2/finemath3-tokenized/00076_00000_doc_shuffled.ds
225
+ - data_token/as_fm3_omi2/finemath3-tokenized/00077_00000_doc_shuffled.ds
226
+ - data_token/as_fm3_omi2/finemath3-tokenized/00078_00000_doc_shuffled.ds
227
+ - data_token/as_fm3_omi2/finemath3-tokenized/00079_00000_doc_shuffled.ds
228
+ - data_token/as_fm3_omi2/finemath3-tokenized/00080_00000_doc_shuffled.ds
229
+ - data_token/as_fm3_omi2/finemath3-tokenized/00081_00000_doc_shuffled.ds
230
+ - data_token/as_fm3_omi2/finemath3-tokenized/00082_00000_doc_shuffled.ds
231
+ - data_token/as_fm3_omi2/finemath3-tokenized/00083_00000_doc_shuffled.ds
232
+ - data_token/as_fm3_omi2/finemath3-tokenized/00084_00000_doc_shuffled.ds
233
+ - data_token/as_fm3_omi2/finemath3-tokenized/00085_00000_doc_shuffled.ds
234
+ - data_token/as_fm3_omi2/finemath3-tokenized/00086_00000_doc_shuffled.ds
235
+ - data_token/as_fm3_omi2/finemath3-tokenized/00087_00000_doc_shuffled.ds
236
+ - data_token/as_fm3_omi2/finemath3-tokenized/00088_00000_doc_shuffled.ds
237
+ - data_token/as_fm3_omi2/finemath3-tokenized/00089_00000_doc_shuffled.ds
238
+ - data_token/as_fm3_omi2/finemath3-tokenized/00090_00000_doc_shuffled.ds
239
+ - data_token/as_fm3_omi2/finemath3-tokenized/00091_00000_doc_shuffled.ds
240
+ - data_token/as_fm3_omi2/finemath3-tokenized/00092_00000_doc_shuffled.ds
241
+ - data_token/as_fm3_omi2/finemath3-tokenized/00093_00000_doc_shuffled.ds
242
+ - data_token/as_fm3_omi2/finemath3-tokenized/00094_00000_doc_shuffled.ds
243
+ - data_token/as_fm3_omi2/finemath3-tokenized/00095_00000_doc_shuffled.ds
244
+ - data_token/as_fm3_omi2/finemath3-tokenized/00096_00000_doc_shuffled.ds
245
+ - data_token/as_fm3_omi2/finemath3-tokenized/00097_00000_doc_shuffled.ds
246
+ - data_token/as_fm3_omi2/finemath3-tokenized/00098_00000_doc_shuffled.ds
247
+ - data_token/as_fm3_omi2/finemath3-tokenized/00099_00000_doc_shuffled.ds
248
+ - data_token/as_fm3_omi2/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
249
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
250
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
251
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
252
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
253
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
254
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
255
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
256
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
257
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
258
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
259
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
260
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
261
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
262
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
263
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
264
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
265
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
266
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
267
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
268
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
269
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
270
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
271
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
272
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
273
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
274
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
275
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
276
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
277
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
278
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
279
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
280
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
281
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00032_00000_doc_shuffled.ds
282
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00033_00000_doc_shuffled.ds
283
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00034_00000_doc_shuffled.ds
284
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00035_00000_doc_shuffled.ds
285
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00036_00000_doc_shuffled.ds
286
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00037_00000_doc_shuffled.ds
287
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00038_00000_doc_shuffled.ds
288
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00039_00000_doc_shuffled.ds
289
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00040_00000_doc_shuffled.ds
290
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00041_00000_doc_shuffled.ds
291
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00042_00000_doc_shuffled.ds
292
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00043_00000_doc_shuffled.ds
293
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00044_00000_doc_shuffled.ds
294
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00045_00000_doc_shuffled.ds
295
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00046_00000_doc_shuffled.ds
296
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00047_00000_doc_shuffled.ds
297
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00048_00000_doc_shuffled.ds
298
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00049_00000_doc_shuffled.ds
299
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00050_00000_doc_shuffled.ds
300
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00051_00000_doc_shuffled.ds
301
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00052_00000_doc_shuffled.ds
302
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00053_00000_doc_shuffled.ds
303
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00054_00000_doc_shuffled.ds
304
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00000_00000_doc_shuffled.ds
305
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00001_00000_doc_shuffled.ds
306
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00002_00000_doc_shuffled.ds
307
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00003_00000_doc_shuffled.ds
308
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00004_00000_doc_shuffled.ds
309
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00005_00000_doc_shuffled.ds
310
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00006_00000_doc_shuffled.ds
311
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00007_00000_doc_shuffled.ds
312
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00008_00000_doc_shuffled.ds
313
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00009_00000_doc_shuffled.ds
314
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00010_00000_doc_shuffled.ds
315
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00011_00000_doc_shuffled.ds
316
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00012_00000_doc_shuffled.ds
317
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00013_00000_doc_shuffled.ds
318
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00014_00000_doc_shuffled.ds
319
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00015_00000_doc_shuffled.ds
320
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00016_00000_doc_shuffled.ds
321
+ memmap_dtype: uint16
322
+ datasets: null
323
+ label_mask_paths: null
324
+ pad_direction: right
325
+ generate_attention_mask: false
326
+ generate_doc_lengths: false
327
+ num_workers: 32
328
+ drop_last: true
329
+ pin_memory: true
330
+ prefetch_factor: 8
331
+ persistent_workers: true
332
+ timeout: 0
333
+ seed: null
334
+ instance_filter: null
335
+ custom_dataset: null
336
+ restore_dataloader: true
337
+ fast_forward_batches: null
338
+ evaluators: []
339
+ eval_interval: 5000
340
+ tokenizer:
341
+ identifier: meta-llama/Llama-2-7b-hf
342
+ truncate_direction: right
343
+ save_folder: checkpoints/OLMo-1B-as_fm3_omi2
344
+ remote_save_folder: null
345
+ canceled_check_interval: 6000
346
+ save_interval: 3000
347
+ save_interval_unsharded: 3000
348
+ save_interval_ephemeral: null
349
+ save_num_checkpoints_to_keep: -1
350
+ save_num_unsharded_checkpoints_to_keep: -1
351
+ save_overwrite: true
352
+ force_save_unsharded: false
353
+ no_pre_train_checkpoint: false
354
+ load_path: checkpoints/OLMo-1B-as_fm3_omi2/step9000-unsharded
355
+ load_path_sharded_checkpointer: null
356
+ try_load_latest_save: false
357
+ reset_optimizer_state: false
358
+ reset_trainer_state: false
359
+ sharded_checkpointer: torch_legacy
360
+ new_style_checkpoints: null
361
+ max_duration: 1ep
362
+ global_train_batch_size: 512
363
+ device_train_batch_size: 128
364
+ device_train_microbatch_size: 16
365
+ device_eval_batch_size: 16
366
+ eval_subset_num_batches: -1
367
+ eval_on_load: false
368
+ device_train_grad_accum: 8
369
+ max_grad_norm: 1.0
370
+ max_grad_norm_ratio: null
371
+ precision: amp_bf16
372
+ wandb:
373
+ project: olmo-debug
374
+ entity: null
375
+ group: null
376
+ name: OLMo-1B-as_fm3_omi2
377
+ tags:
378
+ - watching
379
+ log_artifacts: false
380
+ rank_zero_only: true
381
+ log_interval: 1
382
+ speed_monitor:
383
+ window_size: 20
384
+ gpu_flops_available: null
385
+ console_log_interval: 1
386
+ gen1_gc_interval: 1
387
+ compile: null
388
+ distributed_strategy: fsdp
389
+ fsdp:
390
+ use_orig_params: true
391
+ sharding_strategy: FULL_SHARD
392
+ wrapping_strategy: null
393
+ precision: mixed
394
+ hybrid_sharding_num_model_replicas: null
395
+ ddp:
396
+ grad_sync_mode: batch
397
+ find_unused_params: false
398
+ single:
399
+ device: auto
400
+ softmax_auxiliary_loss: false
401
+ auxiliary_loss_multiplier: 0.0001
402
+ time_limit: null
403
+ extra_steps_after_cancel: 10
404
+ early_stopping_factor: null
405
+ save_data_indices: true
406
+ python_profiling: false
407
+ torch_profiling: false
408
+ stop_at: 62228
409
+ stop_after: null
410
+ activation_checkpointing: null
411
+ fused_loss: null
412
+ hf_datasets_cache_dir: null
413
+ module_outputs_save_steps: null
models/OLMo-1B/step15000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13aa7a3f17fd69605e81dd2a47fb724286a258d6cd2a837fe0139e89942a9421
3
+ size 4557141517
models/OLMo-1B/step15000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8bb8871f6edfb322f3c4e7606e38af0a0f21d318669f444d6015ac92338817f
3
+ size 9114282430
models/OLMo-1B/step15000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abe799545333496e81ffc8a03d72d1650b607424c01861b7b10902e19a53943c
3
+ size 14860
models/OLMo-1B/step15000/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "OlmoForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "clip_qkv": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 50279,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 8192,
14
+ "max_position_embeddings": 2048,
15
+ "model_type": "olmo",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 16,
18
+ "num_key_value_heads": 16,
19
+ "pad_token_id": 1,
20
+ "rope_scaling": null,
21
+ "rope_theta": 10000.0,
22
+ "tie_word_embeddings": true,
23
+ "transformers_version": "4.57.3",
24
+ "use_cache": true,
25
+ "vocab_size": 32000
26
+ }
models/OLMo-1B/step15000/config.yaml ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-1B-as_fm3_omi2
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 2048
7
+ n_heads: 16
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 16
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: false
22
+ attention_dropout: 0.0
23
+ multi_query_attention: false
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: true
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0005
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: cosine_with_warmup
65
+ units: steps
66
+ t_warmup: 2000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
88
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00019_00000_doc_shuffled.ds
89
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
90
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
91
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
92
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
93
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
94
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
95
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
96
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
97
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
98
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
99
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
100
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
101
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00032_00000_doc_shuffled.ds
102
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00033_00000_doc_shuffled.ds
103
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00034_00000_doc_shuffled.ds
104
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00035_00000_doc_shuffled.ds
105
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00036_00000_doc_shuffled.ds
106
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00037_00000_doc_shuffled.ds
107
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00038_00000_doc_shuffled.ds
108
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00039_00000_doc_shuffled.ds
109
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00040_00000_doc_shuffled.ds
110
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00041_00000_doc_shuffled.ds
111
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00042_00000_doc_shuffled.ds
112
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00043_00000_doc_shuffled.ds
113
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00044_00000_doc_shuffled.ds
114
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00045_00000_doc_shuffled.ds
115
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00046_00000_doc_shuffled.ds
116
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00047_00000_doc_shuffled.ds
117
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00048_00000_doc_shuffled.ds
118
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00049_00000_doc_shuffled.ds
119
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00050_00000_doc_shuffled.ds
120
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00051_00000_doc_shuffled.ds
121
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00052_00000_doc_shuffled.ds
122
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00053_00000_doc_shuffled.ds
123
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00054_00000_doc_shuffled.ds
124
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00055_00000_doc_shuffled.ds
125
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00056_00000_doc_shuffled.ds
126
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00057_00000_doc_shuffled.ds
127
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00058_00000_doc_shuffled.ds
128
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00059_00000_doc_shuffled.ds
129
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00060_00000_doc_shuffled.ds
130
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00061_00000_doc_shuffled.ds
131
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00062_00000_doc_shuffled.ds
132
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00063_00000_doc_shuffled.ds
133
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00064_00000_doc_shuffled.ds
134
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00065_00000_doc_shuffled.ds
135
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00066_00000_doc_shuffled.ds
136
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00067_00000_doc_shuffled.ds
137
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00068_00000_doc_shuffled.ds
138
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00069_00000_doc_shuffled.ds
139
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00070_00000_doc_shuffled.ds
140
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00071_00000_doc_shuffled.ds
141
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00072_00000_doc_shuffled.ds
142
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00073_00000_doc_shuffled.ds
143
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00074_00000_doc_shuffled.ds
144
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00075_00000_doc_shuffled.ds
145
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00076_00000_doc_shuffled.ds
146
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00077_00000_doc_shuffled.ds
147
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00078_00000_doc_shuffled.ds
148
+ - data_token/as_fm3_omi2/finemath3-tokenized/00000_00000_doc_shuffled.ds
149
+ - data_token/as_fm3_omi2/finemath3-tokenized/00001_00000_doc_shuffled.ds
150
+ - data_token/as_fm3_omi2/finemath3-tokenized/00002_00000_doc_shuffled.ds
151
+ - data_token/as_fm3_omi2/finemath3-tokenized/00003_00000_doc_shuffled.ds
152
+ - data_token/as_fm3_omi2/finemath3-tokenized/00004_00000_doc_shuffled.ds
153
+ - data_token/as_fm3_omi2/finemath3-tokenized/00005_00000_doc_shuffled.ds
154
+ - data_token/as_fm3_omi2/finemath3-tokenized/00006_00000_doc_shuffled.ds
155
+ - data_token/as_fm3_omi2/finemath3-tokenized/00007_00000_doc_shuffled.ds
156
+ - data_token/as_fm3_omi2/finemath3-tokenized/00008_00000_doc_shuffled.ds
157
+ - data_token/as_fm3_omi2/finemath3-tokenized/00009_00000_doc_shuffled.ds
158
+ - data_token/as_fm3_omi2/finemath3-tokenized/00010_00000_doc_shuffled.ds
159
+ - data_token/as_fm3_omi2/finemath3-tokenized/00011_00000_doc_shuffled.ds
160
+ - data_token/as_fm3_omi2/finemath3-tokenized/00012_00000_doc_shuffled.ds
161
+ - data_token/as_fm3_omi2/finemath3-tokenized/00013_00000_doc_shuffled.ds
162
+ - data_token/as_fm3_omi2/finemath3-tokenized/00014_00000_doc_shuffled.ds
163
+ - data_token/as_fm3_omi2/finemath3-tokenized/00015_00000_doc_shuffled.ds
164
+ - data_token/as_fm3_omi2/finemath3-tokenized/00016_00000_doc_shuffled.ds
165
+ - data_token/as_fm3_omi2/finemath3-tokenized/00017_00000_doc_shuffled.ds
166
+ - data_token/as_fm3_omi2/finemath3-tokenized/00018_00000_doc_shuffled.ds
167
+ - data_token/as_fm3_omi2/finemath3-tokenized/00019_00000_doc_shuffled.ds
168
+ - data_token/as_fm3_omi2/finemath3-tokenized/00020_00000_doc_shuffled.ds
169
+ - data_token/as_fm3_omi2/finemath3-tokenized/00021_00000_doc_shuffled.ds
170
+ - data_token/as_fm3_omi2/finemath3-tokenized/00022_00000_doc_shuffled.ds
171
+ - data_token/as_fm3_omi2/finemath3-tokenized/00023_00000_doc_shuffled.ds
172
+ - data_token/as_fm3_omi2/finemath3-tokenized/00024_00000_doc_shuffled.ds
173
+ - data_token/as_fm3_omi2/finemath3-tokenized/00025_00000_doc_shuffled.ds
174
+ - data_token/as_fm3_omi2/finemath3-tokenized/00026_00000_doc_shuffled.ds
175
+ - data_token/as_fm3_omi2/finemath3-tokenized/00027_00000_doc_shuffled.ds
176
+ - data_token/as_fm3_omi2/finemath3-tokenized/00028_00000_doc_shuffled.ds
177
+ - data_token/as_fm3_omi2/finemath3-tokenized/00029_00000_doc_shuffled.ds
178
+ - data_token/as_fm3_omi2/finemath3-tokenized/00030_00000_doc_shuffled.ds
179
+ - data_token/as_fm3_omi2/finemath3-tokenized/00031_00000_doc_shuffled.ds
180
+ - data_token/as_fm3_omi2/finemath3-tokenized/00032_00000_doc_shuffled.ds
181
+ - data_token/as_fm3_omi2/finemath3-tokenized/00033_00000_doc_shuffled.ds
182
+ - data_token/as_fm3_omi2/finemath3-tokenized/00034_00000_doc_shuffled.ds
183
+ - data_token/as_fm3_omi2/finemath3-tokenized/00035_00000_doc_shuffled.ds
184
+ - data_token/as_fm3_omi2/finemath3-tokenized/00036_00000_doc_shuffled.ds
185
+ - data_token/as_fm3_omi2/finemath3-tokenized/00037_00000_doc_shuffled.ds
186
+ - data_token/as_fm3_omi2/finemath3-tokenized/00038_00000_doc_shuffled.ds
187
+ - data_token/as_fm3_omi2/finemath3-tokenized/00039_00000_doc_shuffled.ds
188
+ - data_token/as_fm3_omi2/finemath3-tokenized/00040_00000_doc_shuffled.ds
189
+ - data_token/as_fm3_omi2/finemath3-tokenized/00041_00000_doc_shuffled.ds
190
+ - data_token/as_fm3_omi2/finemath3-tokenized/00042_00000_doc_shuffled.ds
191
+ - data_token/as_fm3_omi2/finemath3-tokenized/00043_00000_doc_shuffled.ds
192
+ - data_token/as_fm3_omi2/finemath3-tokenized/00044_00000_doc_shuffled.ds
193
+ - data_token/as_fm3_omi2/finemath3-tokenized/00045_00000_doc_shuffled.ds
194
+ - data_token/as_fm3_omi2/finemath3-tokenized/00046_00000_doc_shuffled.ds
195
+ - data_token/as_fm3_omi2/finemath3-tokenized/00047_00000_doc_shuffled.ds
196
+ - data_token/as_fm3_omi2/finemath3-tokenized/00048_00000_doc_shuffled.ds
197
+ - data_token/as_fm3_omi2/finemath3-tokenized/00049_00000_doc_shuffled.ds
198
+ - data_token/as_fm3_omi2/finemath3-tokenized/00050_00000_doc_shuffled.ds
199
+ - data_token/as_fm3_omi2/finemath3-tokenized/00051_00000_doc_shuffled.ds
200
+ - data_token/as_fm3_omi2/finemath3-tokenized/00052_00000_doc_shuffled.ds
201
+ - data_token/as_fm3_omi2/finemath3-tokenized/00053_00000_doc_shuffled.ds
202
+ - data_token/as_fm3_omi2/finemath3-tokenized/00054_00000_doc_shuffled.ds
203
+ - data_token/as_fm3_omi2/finemath3-tokenized/00055_00000_doc_shuffled.ds
204
+ - data_token/as_fm3_omi2/finemath3-tokenized/00056_00000_doc_shuffled.ds
205
+ - data_token/as_fm3_omi2/finemath3-tokenized/00057_00000_doc_shuffled.ds
206
+ - data_token/as_fm3_omi2/finemath3-tokenized/00058_00000_doc_shuffled.ds
207
+ - data_token/as_fm3_omi2/finemath3-tokenized/00059_00000_doc_shuffled.ds
208
+ - data_token/as_fm3_omi2/finemath3-tokenized/00060_00000_doc_shuffled.ds
209
+ - data_token/as_fm3_omi2/finemath3-tokenized/00061_00000_doc_shuffled.ds
210
+ - data_token/as_fm3_omi2/finemath3-tokenized/00062_00000_doc_shuffled.ds
211
+ - data_token/as_fm3_omi2/finemath3-tokenized/00063_00000_doc_shuffled.ds
212
+ - data_token/as_fm3_omi2/finemath3-tokenized/00064_00000_doc_shuffled.ds
213
+ - data_token/as_fm3_omi2/finemath3-tokenized/00065_00000_doc_shuffled.ds
214
+ - data_token/as_fm3_omi2/finemath3-tokenized/00066_00000_doc_shuffled.ds
215
+ - data_token/as_fm3_omi2/finemath3-tokenized/00067_00000_doc_shuffled.ds
216
+ - data_token/as_fm3_omi2/finemath3-tokenized/00068_00000_doc_shuffled.ds
217
+ - data_token/as_fm3_omi2/finemath3-tokenized/00069_00000_doc_shuffled.ds
218
+ - data_token/as_fm3_omi2/finemath3-tokenized/00070_00000_doc_shuffled.ds
219
+ - data_token/as_fm3_omi2/finemath3-tokenized/00071_00000_doc_shuffled.ds
220
+ - data_token/as_fm3_omi2/finemath3-tokenized/00072_00000_doc_shuffled.ds
221
+ - data_token/as_fm3_omi2/finemath3-tokenized/00073_00000_doc_shuffled.ds
222
+ - data_token/as_fm3_omi2/finemath3-tokenized/00074_00000_doc_shuffled.ds
223
+ - data_token/as_fm3_omi2/finemath3-tokenized/00075_00000_doc_shuffled.ds
224
+ - data_token/as_fm3_omi2/finemath3-tokenized/00076_00000_doc_shuffled.ds
225
+ - data_token/as_fm3_omi2/finemath3-tokenized/00077_00000_doc_shuffled.ds
226
+ - data_token/as_fm3_omi2/finemath3-tokenized/00078_00000_doc_shuffled.ds
227
+ - data_token/as_fm3_omi2/finemath3-tokenized/00079_00000_doc_shuffled.ds
228
+ - data_token/as_fm3_omi2/finemath3-tokenized/00080_00000_doc_shuffled.ds
229
+ - data_token/as_fm3_omi2/finemath3-tokenized/00081_00000_doc_shuffled.ds
230
+ - data_token/as_fm3_omi2/finemath3-tokenized/00082_00000_doc_shuffled.ds
231
+ - data_token/as_fm3_omi2/finemath3-tokenized/00083_00000_doc_shuffled.ds
232
+ - data_token/as_fm3_omi2/finemath3-tokenized/00084_00000_doc_shuffled.ds
233
+ - data_token/as_fm3_omi2/finemath3-tokenized/00085_00000_doc_shuffled.ds
234
+ - data_token/as_fm3_omi2/finemath3-tokenized/00086_00000_doc_shuffled.ds
235
+ - data_token/as_fm3_omi2/finemath3-tokenized/00087_00000_doc_shuffled.ds
236
+ - data_token/as_fm3_omi2/finemath3-tokenized/00088_00000_doc_shuffled.ds
237
+ - data_token/as_fm3_omi2/finemath3-tokenized/00089_00000_doc_shuffled.ds
238
+ - data_token/as_fm3_omi2/finemath3-tokenized/00090_00000_doc_shuffled.ds
239
+ - data_token/as_fm3_omi2/finemath3-tokenized/00091_00000_doc_shuffled.ds
240
+ - data_token/as_fm3_omi2/finemath3-tokenized/00092_00000_doc_shuffled.ds
241
+ - data_token/as_fm3_omi2/finemath3-tokenized/00093_00000_doc_shuffled.ds
242
+ - data_token/as_fm3_omi2/finemath3-tokenized/00094_00000_doc_shuffled.ds
243
+ - data_token/as_fm3_omi2/finemath3-tokenized/00095_00000_doc_shuffled.ds
244
+ - data_token/as_fm3_omi2/finemath3-tokenized/00096_00000_doc_shuffled.ds
245
+ - data_token/as_fm3_omi2/finemath3-tokenized/00097_00000_doc_shuffled.ds
246
+ - data_token/as_fm3_omi2/finemath3-tokenized/00098_00000_doc_shuffled.ds
247
+ - data_token/as_fm3_omi2/finemath3-tokenized/00099_00000_doc_shuffled.ds
248
+ - data_token/as_fm3_omi2/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
249
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
250
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
251
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
252
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
253
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
254
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
255
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
256
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
257
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
258
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
259
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
260
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
261
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
262
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
263
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
264
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
265
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
266
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
267
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
268
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
269
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
270
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
271
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
272
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
273
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
274
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
275
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
276
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
277
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
278
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
279
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
280
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
281
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00032_00000_doc_shuffled.ds
282
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00033_00000_doc_shuffled.ds
283
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00034_00000_doc_shuffled.ds
284
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00035_00000_doc_shuffled.ds
285
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00036_00000_doc_shuffled.ds
286
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00037_00000_doc_shuffled.ds
287
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00038_00000_doc_shuffled.ds
288
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00039_00000_doc_shuffled.ds
289
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00040_00000_doc_shuffled.ds
290
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00041_00000_doc_shuffled.ds
291
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00042_00000_doc_shuffled.ds
292
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00043_00000_doc_shuffled.ds
293
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00044_00000_doc_shuffled.ds
294
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00045_00000_doc_shuffled.ds
295
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00046_00000_doc_shuffled.ds
296
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00047_00000_doc_shuffled.ds
297
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00048_00000_doc_shuffled.ds
298
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00049_00000_doc_shuffled.ds
299
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00050_00000_doc_shuffled.ds
300
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00051_00000_doc_shuffled.ds
301
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00052_00000_doc_shuffled.ds
302
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00053_00000_doc_shuffled.ds
303
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00054_00000_doc_shuffled.ds
304
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00000_00000_doc_shuffled.ds
305
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00001_00000_doc_shuffled.ds
306
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00002_00000_doc_shuffled.ds
307
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00003_00000_doc_shuffled.ds
308
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00004_00000_doc_shuffled.ds
309
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00005_00000_doc_shuffled.ds
310
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00006_00000_doc_shuffled.ds
311
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00007_00000_doc_shuffled.ds
312
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00008_00000_doc_shuffled.ds
313
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00009_00000_doc_shuffled.ds
314
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00010_00000_doc_shuffled.ds
315
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00011_00000_doc_shuffled.ds
316
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00012_00000_doc_shuffled.ds
317
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00013_00000_doc_shuffled.ds
318
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00014_00000_doc_shuffled.ds
319
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00015_00000_doc_shuffled.ds
320
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00016_00000_doc_shuffled.ds
321
+ memmap_dtype: uint16
322
+ datasets: null
323
+ label_mask_paths: null
324
+ pad_direction: right
325
+ generate_attention_mask: false
326
+ generate_doc_lengths: false
327
+ num_workers: 32
328
+ drop_last: true
329
+ pin_memory: true
330
+ prefetch_factor: 8
331
+ persistent_workers: true
332
+ timeout: 0
333
+ seed: null
334
+ instance_filter: null
335
+ custom_dataset: null
336
+ restore_dataloader: true
337
+ fast_forward_batches: null
338
+ evaluators: []
339
+ eval_interval: 5000
340
+ tokenizer:
341
+ identifier: meta-llama/Llama-2-7b-hf
342
+ truncate_direction: right
343
+ save_folder: checkpoints/OLMo-1B-as_fm3_omi2
344
+ remote_save_folder: null
345
+ canceled_check_interval: 6000
346
+ save_interval: 3000
347
+ save_interval_unsharded: 3000
348
+ save_interval_ephemeral: null
349
+ save_num_checkpoints_to_keep: -1
350
+ save_num_unsharded_checkpoints_to_keep: -1
351
+ save_overwrite: true
352
+ force_save_unsharded: false
353
+ no_pre_train_checkpoint: false
354
+ load_path: checkpoints/OLMo-1B-as_fm3_omi2/step9000-unsharded
355
+ load_path_sharded_checkpointer: null
356
+ try_load_latest_save: false
357
+ reset_optimizer_state: false
358
+ reset_trainer_state: false
359
+ sharded_checkpointer: torch_legacy
360
+ new_style_checkpoints: null
361
+ max_duration: 1ep
362
+ global_train_batch_size: 512
363
+ device_train_batch_size: 128
364
+ device_train_microbatch_size: 16
365
+ device_eval_batch_size: 16
366
+ eval_subset_num_batches: -1
367
+ eval_on_load: false
368
+ device_train_grad_accum: 8
369
+ max_grad_norm: 1.0
370
+ max_grad_norm_ratio: null
371
+ precision: amp_bf16
372
+ wandb:
373
+ project: olmo-debug
374
+ entity: null
375
+ group: null
376
+ name: OLMo-1B-as_fm3_omi2
377
+ tags:
378
+ - watching
379
+ log_artifacts: false
380
+ rank_zero_only: true
381
+ log_interval: 1
382
+ speed_monitor:
383
+ window_size: 20
384
+ gpu_flops_available: null
385
+ console_log_interval: 1
386
+ gen1_gc_interval: 1
387
+ compile: null
388
+ distributed_strategy: fsdp
389
+ fsdp:
390
+ use_orig_params: true
391
+ sharding_strategy: FULL_SHARD
392
+ wrapping_strategy: null
393
+ precision: mixed
394
+ hybrid_sharding_num_model_replicas: null
395
+ ddp:
396
+ grad_sync_mode: batch
397
+ find_unused_params: false
398
+ single:
399
+ device: auto
400
+ softmax_auxiliary_loss: false
401
+ auxiliary_loss_multiplier: 0.0001
402
+ time_limit: null
403
+ extra_steps_after_cancel: 10
404
+ early_stopping_factor: null
405
+ save_data_indices: true
406
+ python_profiling: false
407
+ torch_profiling: false
408
+ stop_at: 62228
409
+ stop_after: null
410
+ activation_checkpointing: null
411
+ fused_loss: null
412
+ hf_datasets_cache_dir: null
413
+ module_outputs_save_steps: null
models/OLMo-1B/step15000/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 50279,
4
+ "pad_token_id": 1,
5
+ "transformers_version": "4.57.3"
6
+ }
models/OLMo-1B/step15000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7879a1161aa43f00625455be448d71cecac332624fae17abf47dd663ae3bf992
3
+ size 4557138549
models/OLMo-1B/step15000/rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45429deac2ab5835293510d348ccc2eb9e3d7230a4255b383644555b786741df
3
+ size 3418099154
models/OLMo-1B/step15000/rank1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33eb51d65a91020f3c1372073f89f5421b180ea86470e3a11cf7bf9cfb1597f1
3
+ size 3418099346
models/OLMo-1B/step15000/rank2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0185a8854a19540e0e81a1822fe3547738cb75db357995a4efae6d0b1d0b7401
3
+ size 3418099346
models/OLMo-1B/step15000/rank3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c3dbd6691af149040a05cc159a3bded247a1e6999aa8f37c2882e15565076c6
3
+ size 3418099346
models/OLMo-1B/step15000/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "pad_token": "<s>"
3
+ }
models/OLMo-1B/step15000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/OLMo-1B/step15000/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": null,
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "",
34
+ "extra_special_tokens": {},
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": "<s>",
37
+ "tokenizer_class": "GPTNeoXTokenizer",
38
+ "unk_token": null
39
+ }
models/OLMo-1B/step18000-unsharded/config.yaml ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-1B-as_fm3_omi2
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 2048
7
+ n_heads: 16
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 16
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: false
22
+ attention_dropout: 0.0
23
+ multi_query_attention: false
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: true
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0005
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: cosine_with_warmup
65
+ units: steps
66
+ t_warmup: 2000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
88
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00019_00000_doc_shuffled.ds
89
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
90
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
91
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
92
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
93
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
94
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
95
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
96
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
97
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
98
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
99
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
100
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
101
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00032_00000_doc_shuffled.ds
102
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00033_00000_doc_shuffled.ds
103
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00034_00000_doc_shuffled.ds
104
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00035_00000_doc_shuffled.ds
105
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00036_00000_doc_shuffled.ds
106
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00037_00000_doc_shuffled.ds
107
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00038_00000_doc_shuffled.ds
108
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00039_00000_doc_shuffled.ds
109
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00040_00000_doc_shuffled.ds
110
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00041_00000_doc_shuffled.ds
111
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00042_00000_doc_shuffled.ds
112
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00043_00000_doc_shuffled.ds
113
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00044_00000_doc_shuffled.ds
114
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00045_00000_doc_shuffled.ds
115
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00046_00000_doc_shuffled.ds
116
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00047_00000_doc_shuffled.ds
117
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00048_00000_doc_shuffled.ds
118
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00049_00000_doc_shuffled.ds
119
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00050_00000_doc_shuffled.ds
120
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00051_00000_doc_shuffled.ds
121
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00052_00000_doc_shuffled.ds
122
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00053_00000_doc_shuffled.ds
123
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00054_00000_doc_shuffled.ds
124
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00055_00000_doc_shuffled.ds
125
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00056_00000_doc_shuffled.ds
126
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00057_00000_doc_shuffled.ds
127
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00058_00000_doc_shuffled.ds
128
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00059_00000_doc_shuffled.ds
129
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00060_00000_doc_shuffled.ds
130
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00061_00000_doc_shuffled.ds
131
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00062_00000_doc_shuffled.ds
132
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00063_00000_doc_shuffled.ds
133
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00064_00000_doc_shuffled.ds
134
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00065_00000_doc_shuffled.ds
135
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00066_00000_doc_shuffled.ds
136
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00067_00000_doc_shuffled.ds
137
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00068_00000_doc_shuffled.ds
138
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00069_00000_doc_shuffled.ds
139
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00070_00000_doc_shuffled.ds
140
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00071_00000_doc_shuffled.ds
141
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00072_00000_doc_shuffled.ds
142
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00073_00000_doc_shuffled.ds
143
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00074_00000_doc_shuffled.ds
144
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00075_00000_doc_shuffled.ds
145
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00076_00000_doc_shuffled.ds
146
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00077_00000_doc_shuffled.ds
147
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00078_00000_doc_shuffled.ds
148
+ - data_token/as_fm3_omi2/finemath3-tokenized/00000_00000_doc_shuffled.ds
149
+ - data_token/as_fm3_omi2/finemath3-tokenized/00001_00000_doc_shuffled.ds
150
+ - data_token/as_fm3_omi2/finemath3-tokenized/00002_00000_doc_shuffled.ds
151
+ - data_token/as_fm3_omi2/finemath3-tokenized/00003_00000_doc_shuffled.ds
152
+ - data_token/as_fm3_omi2/finemath3-tokenized/00004_00000_doc_shuffled.ds
153
+ - data_token/as_fm3_omi2/finemath3-tokenized/00005_00000_doc_shuffled.ds
154
+ - data_token/as_fm3_omi2/finemath3-tokenized/00006_00000_doc_shuffled.ds
155
+ - data_token/as_fm3_omi2/finemath3-tokenized/00007_00000_doc_shuffled.ds
156
+ - data_token/as_fm3_omi2/finemath3-tokenized/00008_00000_doc_shuffled.ds
157
+ - data_token/as_fm3_omi2/finemath3-tokenized/00009_00000_doc_shuffled.ds
158
+ - data_token/as_fm3_omi2/finemath3-tokenized/00010_00000_doc_shuffled.ds
159
+ - data_token/as_fm3_omi2/finemath3-tokenized/00011_00000_doc_shuffled.ds
160
+ - data_token/as_fm3_omi2/finemath3-tokenized/00012_00000_doc_shuffled.ds
161
+ - data_token/as_fm3_omi2/finemath3-tokenized/00013_00000_doc_shuffled.ds
162
+ - data_token/as_fm3_omi2/finemath3-tokenized/00014_00000_doc_shuffled.ds
163
+ - data_token/as_fm3_omi2/finemath3-tokenized/00015_00000_doc_shuffled.ds
164
+ - data_token/as_fm3_omi2/finemath3-tokenized/00016_00000_doc_shuffled.ds
165
+ - data_token/as_fm3_omi2/finemath3-tokenized/00017_00000_doc_shuffled.ds
166
+ - data_token/as_fm3_omi2/finemath3-tokenized/00018_00000_doc_shuffled.ds
167
+ - data_token/as_fm3_omi2/finemath3-tokenized/00019_00000_doc_shuffled.ds
168
+ - data_token/as_fm3_omi2/finemath3-tokenized/00020_00000_doc_shuffled.ds
169
+ - data_token/as_fm3_omi2/finemath3-tokenized/00021_00000_doc_shuffled.ds
170
+ - data_token/as_fm3_omi2/finemath3-tokenized/00022_00000_doc_shuffled.ds
171
+ - data_token/as_fm3_omi2/finemath3-tokenized/00023_00000_doc_shuffled.ds
172
+ - data_token/as_fm3_omi2/finemath3-tokenized/00024_00000_doc_shuffled.ds
173
+ - data_token/as_fm3_omi2/finemath3-tokenized/00025_00000_doc_shuffled.ds
174
+ - data_token/as_fm3_omi2/finemath3-tokenized/00026_00000_doc_shuffled.ds
175
+ - data_token/as_fm3_omi2/finemath3-tokenized/00027_00000_doc_shuffled.ds
176
+ - data_token/as_fm3_omi2/finemath3-tokenized/00028_00000_doc_shuffled.ds
177
+ - data_token/as_fm3_omi2/finemath3-tokenized/00029_00000_doc_shuffled.ds
178
+ - data_token/as_fm3_omi2/finemath3-tokenized/00030_00000_doc_shuffled.ds
179
+ - data_token/as_fm3_omi2/finemath3-tokenized/00031_00000_doc_shuffled.ds
180
+ - data_token/as_fm3_omi2/finemath3-tokenized/00032_00000_doc_shuffled.ds
181
+ - data_token/as_fm3_omi2/finemath3-tokenized/00033_00000_doc_shuffled.ds
182
+ - data_token/as_fm3_omi2/finemath3-tokenized/00034_00000_doc_shuffled.ds
183
+ - data_token/as_fm3_omi2/finemath3-tokenized/00035_00000_doc_shuffled.ds
184
+ - data_token/as_fm3_omi2/finemath3-tokenized/00036_00000_doc_shuffled.ds
185
+ - data_token/as_fm3_omi2/finemath3-tokenized/00037_00000_doc_shuffled.ds
186
+ - data_token/as_fm3_omi2/finemath3-tokenized/00038_00000_doc_shuffled.ds
187
+ - data_token/as_fm3_omi2/finemath3-tokenized/00039_00000_doc_shuffled.ds
188
+ - data_token/as_fm3_omi2/finemath3-tokenized/00040_00000_doc_shuffled.ds
189
+ - data_token/as_fm3_omi2/finemath3-tokenized/00041_00000_doc_shuffled.ds
190
+ - data_token/as_fm3_omi2/finemath3-tokenized/00042_00000_doc_shuffled.ds
191
+ - data_token/as_fm3_omi2/finemath3-tokenized/00043_00000_doc_shuffled.ds
192
+ - data_token/as_fm3_omi2/finemath3-tokenized/00044_00000_doc_shuffled.ds
193
+ - data_token/as_fm3_omi2/finemath3-tokenized/00045_00000_doc_shuffled.ds
194
+ - data_token/as_fm3_omi2/finemath3-tokenized/00046_00000_doc_shuffled.ds
195
+ - data_token/as_fm3_omi2/finemath3-tokenized/00047_00000_doc_shuffled.ds
196
+ - data_token/as_fm3_omi2/finemath3-tokenized/00048_00000_doc_shuffled.ds
197
+ - data_token/as_fm3_omi2/finemath3-tokenized/00049_00000_doc_shuffled.ds
198
+ - data_token/as_fm3_omi2/finemath3-tokenized/00050_00000_doc_shuffled.ds
199
+ - data_token/as_fm3_omi2/finemath3-tokenized/00051_00000_doc_shuffled.ds
200
+ - data_token/as_fm3_omi2/finemath3-tokenized/00052_00000_doc_shuffled.ds
201
+ - data_token/as_fm3_omi2/finemath3-tokenized/00053_00000_doc_shuffled.ds
202
+ - data_token/as_fm3_omi2/finemath3-tokenized/00054_00000_doc_shuffled.ds
203
+ - data_token/as_fm3_omi2/finemath3-tokenized/00055_00000_doc_shuffled.ds
204
+ - data_token/as_fm3_omi2/finemath3-tokenized/00056_00000_doc_shuffled.ds
205
+ - data_token/as_fm3_omi2/finemath3-tokenized/00057_00000_doc_shuffled.ds
206
+ - data_token/as_fm3_omi2/finemath3-tokenized/00058_00000_doc_shuffled.ds
207
+ - data_token/as_fm3_omi2/finemath3-tokenized/00059_00000_doc_shuffled.ds
208
+ - data_token/as_fm3_omi2/finemath3-tokenized/00060_00000_doc_shuffled.ds
209
+ - data_token/as_fm3_omi2/finemath3-tokenized/00061_00000_doc_shuffled.ds
210
+ - data_token/as_fm3_omi2/finemath3-tokenized/00062_00000_doc_shuffled.ds
211
+ - data_token/as_fm3_omi2/finemath3-tokenized/00063_00000_doc_shuffled.ds
212
+ - data_token/as_fm3_omi2/finemath3-tokenized/00064_00000_doc_shuffled.ds
213
+ - data_token/as_fm3_omi2/finemath3-tokenized/00065_00000_doc_shuffled.ds
214
+ - data_token/as_fm3_omi2/finemath3-tokenized/00066_00000_doc_shuffled.ds
215
+ - data_token/as_fm3_omi2/finemath3-tokenized/00067_00000_doc_shuffled.ds
216
+ - data_token/as_fm3_omi2/finemath3-tokenized/00068_00000_doc_shuffled.ds
217
+ - data_token/as_fm3_omi2/finemath3-tokenized/00069_00000_doc_shuffled.ds
218
+ - data_token/as_fm3_omi2/finemath3-tokenized/00070_00000_doc_shuffled.ds
219
+ - data_token/as_fm3_omi2/finemath3-tokenized/00071_00000_doc_shuffled.ds
220
+ - data_token/as_fm3_omi2/finemath3-tokenized/00072_00000_doc_shuffled.ds
221
+ - data_token/as_fm3_omi2/finemath3-tokenized/00073_00000_doc_shuffled.ds
222
+ - data_token/as_fm3_omi2/finemath3-tokenized/00074_00000_doc_shuffled.ds
223
+ - data_token/as_fm3_omi2/finemath3-tokenized/00075_00000_doc_shuffled.ds
224
+ - data_token/as_fm3_omi2/finemath3-tokenized/00076_00000_doc_shuffled.ds
225
+ - data_token/as_fm3_omi2/finemath3-tokenized/00077_00000_doc_shuffled.ds
226
+ - data_token/as_fm3_omi2/finemath3-tokenized/00078_00000_doc_shuffled.ds
227
+ - data_token/as_fm3_omi2/finemath3-tokenized/00079_00000_doc_shuffled.ds
228
+ - data_token/as_fm3_omi2/finemath3-tokenized/00080_00000_doc_shuffled.ds
229
+ - data_token/as_fm3_omi2/finemath3-tokenized/00081_00000_doc_shuffled.ds
230
+ - data_token/as_fm3_omi2/finemath3-tokenized/00082_00000_doc_shuffled.ds
231
+ - data_token/as_fm3_omi2/finemath3-tokenized/00083_00000_doc_shuffled.ds
232
+ - data_token/as_fm3_omi2/finemath3-tokenized/00084_00000_doc_shuffled.ds
233
+ - data_token/as_fm3_omi2/finemath3-tokenized/00085_00000_doc_shuffled.ds
234
+ - data_token/as_fm3_omi2/finemath3-tokenized/00086_00000_doc_shuffled.ds
235
+ - data_token/as_fm3_omi2/finemath3-tokenized/00087_00000_doc_shuffled.ds
236
+ - data_token/as_fm3_omi2/finemath3-tokenized/00088_00000_doc_shuffled.ds
237
+ - data_token/as_fm3_omi2/finemath3-tokenized/00089_00000_doc_shuffled.ds
238
+ - data_token/as_fm3_omi2/finemath3-tokenized/00090_00000_doc_shuffled.ds
239
+ - data_token/as_fm3_omi2/finemath3-tokenized/00091_00000_doc_shuffled.ds
240
+ - data_token/as_fm3_omi2/finemath3-tokenized/00092_00000_doc_shuffled.ds
241
+ - data_token/as_fm3_omi2/finemath3-tokenized/00093_00000_doc_shuffled.ds
242
+ - data_token/as_fm3_omi2/finemath3-tokenized/00094_00000_doc_shuffled.ds
243
+ - data_token/as_fm3_omi2/finemath3-tokenized/00095_00000_doc_shuffled.ds
244
+ - data_token/as_fm3_omi2/finemath3-tokenized/00096_00000_doc_shuffled.ds
245
+ - data_token/as_fm3_omi2/finemath3-tokenized/00097_00000_doc_shuffled.ds
246
+ - data_token/as_fm3_omi2/finemath3-tokenized/00098_00000_doc_shuffled.ds
247
+ - data_token/as_fm3_omi2/finemath3-tokenized/00099_00000_doc_shuffled.ds
248
+ - data_token/as_fm3_omi2/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
249
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
250
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
251
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
252
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
253
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
254
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
255
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
256
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
257
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
258
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
259
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
260
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
261
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
262
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
263
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
264
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
265
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
266
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
267
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
268
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
269
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
270
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
271
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
272
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
273
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
274
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
275
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
276
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
277
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
278
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
279
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
280
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
281
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00032_00000_doc_shuffled.ds
282
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00033_00000_doc_shuffled.ds
283
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00034_00000_doc_shuffled.ds
284
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00035_00000_doc_shuffled.ds
285
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00036_00000_doc_shuffled.ds
286
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00037_00000_doc_shuffled.ds
287
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00038_00000_doc_shuffled.ds
288
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00039_00000_doc_shuffled.ds
289
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00040_00000_doc_shuffled.ds
290
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00041_00000_doc_shuffled.ds
291
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00042_00000_doc_shuffled.ds
292
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00043_00000_doc_shuffled.ds
293
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00044_00000_doc_shuffled.ds
294
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00045_00000_doc_shuffled.ds
295
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00046_00000_doc_shuffled.ds
296
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00047_00000_doc_shuffled.ds
297
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00048_00000_doc_shuffled.ds
298
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00049_00000_doc_shuffled.ds
299
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00050_00000_doc_shuffled.ds
300
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00051_00000_doc_shuffled.ds
301
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00052_00000_doc_shuffled.ds
302
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00053_00000_doc_shuffled.ds
303
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00054_00000_doc_shuffled.ds
304
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00000_00000_doc_shuffled.ds
305
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00001_00000_doc_shuffled.ds
306
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00002_00000_doc_shuffled.ds
307
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00003_00000_doc_shuffled.ds
308
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00004_00000_doc_shuffled.ds
309
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00005_00000_doc_shuffled.ds
310
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00006_00000_doc_shuffled.ds
311
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00007_00000_doc_shuffled.ds
312
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00008_00000_doc_shuffled.ds
313
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00009_00000_doc_shuffled.ds
314
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00010_00000_doc_shuffled.ds
315
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00011_00000_doc_shuffled.ds
316
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00012_00000_doc_shuffled.ds
317
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00013_00000_doc_shuffled.ds
318
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00014_00000_doc_shuffled.ds
319
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00015_00000_doc_shuffled.ds
320
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00016_00000_doc_shuffled.ds
321
+ memmap_dtype: uint16
322
+ datasets: null
323
+ label_mask_paths: null
324
+ pad_direction: right
325
+ generate_attention_mask: false
326
+ generate_doc_lengths: false
327
+ num_workers: 32
328
+ drop_last: true
329
+ pin_memory: true
330
+ prefetch_factor: 8
331
+ persistent_workers: true
332
+ timeout: 0
333
+ seed: null
334
+ instance_filter: null
335
+ custom_dataset: null
336
+ restore_dataloader: true
337
+ fast_forward_batches: null
338
+ evaluators: []
339
+ eval_interval: 5000
340
+ tokenizer:
341
+ identifier: meta-llama/Llama-2-7b-hf
342
+ truncate_direction: right
343
+ save_folder: checkpoints/OLMo-1B-as_fm3_omi2
344
+ remote_save_folder: null
345
+ canceled_check_interval: 6000
346
+ save_interval: 3000
347
+ save_interval_unsharded: 3000
348
+ save_interval_ephemeral: null
349
+ save_num_checkpoints_to_keep: -1
350
+ save_num_unsharded_checkpoints_to_keep: -1
351
+ save_overwrite: true
352
+ force_save_unsharded: false
353
+ no_pre_train_checkpoint: false
354
+ load_path: checkpoints/OLMo-1B-as_fm3_omi2/step9000-unsharded
355
+ load_path_sharded_checkpointer: null
356
+ try_load_latest_save: false
357
+ reset_optimizer_state: false
358
+ reset_trainer_state: false
359
+ sharded_checkpointer: torch_legacy
360
+ new_style_checkpoints: null
361
+ max_duration: 1ep
362
+ global_train_batch_size: 512
363
+ device_train_batch_size: 128
364
+ device_train_microbatch_size: 16
365
+ device_eval_batch_size: 16
366
+ eval_subset_num_batches: -1
367
+ eval_on_load: false
368
+ device_train_grad_accum: 8
369
+ max_grad_norm: 1.0
370
+ max_grad_norm_ratio: null
371
+ precision: amp_bf16
372
+ wandb:
373
+ project: olmo-debug
374
+ entity: null
375
+ group: null
376
+ name: OLMo-1B-as_fm3_omi2
377
+ tags:
378
+ - watching
379
+ log_artifacts: false
380
+ rank_zero_only: true
381
+ log_interval: 1
382
+ speed_monitor:
383
+ window_size: 20
384
+ gpu_flops_available: null
385
+ console_log_interval: 1
386
+ gen1_gc_interval: 1
387
+ compile: null
388
+ distributed_strategy: fsdp
389
+ fsdp:
390
+ use_orig_params: true
391
+ sharding_strategy: FULL_SHARD
392
+ wrapping_strategy: null
393
+ precision: mixed
394
+ hybrid_sharding_num_model_replicas: null
395
+ ddp:
396
+ grad_sync_mode: batch
397
+ find_unused_params: false
398
+ single:
399
+ device: auto
400
+ softmax_auxiliary_loss: false
401
+ auxiliary_loss_multiplier: 0.0001
402
+ time_limit: null
403
+ extra_steps_after_cancel: 10
404
+ early_stopping_factor: null
405
+ save_data_indices: true
406
+ python_profiling: false
407
+ torch_profiling: false
408
+ stop_at: 62228
409
+ stop_after: null
410
+ activation_checkpointing: null
411
+ fused_loss: null
412
+ hf_datasets_cache_dir: null
413
+ module_outputs_save_steps: null
models/OLMo-1B/step18000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6be5b72cdb67d7d3896f04315d0b34a9e0f7e553c465c68d15bcd7c1b44845e6
3
+ size 4557141517
models/OLMo-1B/step18000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:580674f33c785767189407659ede6da95311b401c9771d3c1d39374cacdf1c51
3
+ size 9114282430
models/OLMo-1B/step18000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7785827feaffc3d6fe4903a0958e42fc0593b35e3469b345faf1f291b9dd1512
3
+ size 14924
models/OLMo-1B/step18000/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "OlmoForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "clip_qkv": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 50279,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 8192,
14
+ "max_position_embeddings": 2048,
15
+ "model_type": "olmo",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 16,
18
+ "num_key_value_heads": 16,
19
+ "pad_token_id": 1,
20
+ "rope_scaling": null,
21
+ "rope_theta": 10000.0,
22
+ "tie_word_embeddings": true,
23
+ "transformers_version": "4.57.3",
24
+ "use_cache": true,
25
+ "vocab_size": 32000
26
+ }
models/OLMo-1B/step18000/config.yaml ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-1B-as_fm3_omi2
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 2048
7
+ n_heads: 16
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 16
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: false
22
+ attention_dropout: 0.0
23
+ multi_query_attention: false
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: true
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0005
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: cosine_with_warmup
65
+ units: steps
66
+ t_warmup: 2000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
88
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00019_00000_doc_shuffled.ds
89
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
90
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
91
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
92
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
93
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
94
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
95
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
96
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
97
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
98
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
99
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
100
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
101
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00032_00000_doc_shuffled.ds
102
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00033_00000_doc_shuffled.ds
103
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00034_00000_doc_shuffled.ds
104
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00035_00000_doc_shuffled.ds
105
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00036_00000_doc_shuffled.ds
106
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00037_00000_doc_shuffled.ds
107
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00038_00000_doc_shuffled.ds
108
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00039_00000_doc_shuffled.ds
109
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00040_00000_doc_shuffled.ds
110
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00041_00000_doc_shuffled.ds
111
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00042_00000_doc_shuffled.ds
112
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00043_00000_doc_shuffled.ds
113
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00044_00000_doc_shuffled.ds
114
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00045_00000_doc_shuffled.ds
115
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00046_00000_doc_shuffled.ds
116
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00047_00000_doc_shuffled.ds
117
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00048_00000_doc_shuffled.ds
118
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00049_00000_doc_shuffled.ds
119
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00050_00000_doc_shuffled.ds
120
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00051_00000_doc_shuffled.ds
121
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00052_00000_doc_shuffled.ds
122
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00053_00000_doc_shuffled.ds
123
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00054_00000_doc_shuffled.ds
124
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00055_00000_doc_shuffled.ds
125
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00056_00000_doc_shuffled.ds
126
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00057_00000_doc_shuffled.ds
127
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00058_00000_doc_shuffled.ds
128
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00059_00000_doc_shuffled.ds
129
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00060_00000_doc_shuffled.ds
130
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00061_00000_doc_shuffled.ds
131
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00062_00000_doc_shuffled.ds
132
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00063_00000_doc_shuffled.ds
133
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00064_00000_doc_shuffled.ds
134
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00065_00000_doc_shuffled.ds
135
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00066_00000_doc_shuffled.ds
136
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00067_00000_doc_shuffled.ds
137
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00068_00000_doc_shuffled.ds
138
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00069_00000_doc_shuffled.ds
139
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00070_00000_doc_shuffled.ds
140
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00071_00000_doc_shuffled.ds
141
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00072_00000_doc_shuffled.ds
142
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00073_00000_doc_shuffled.ds
143
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00074_00000_doc_shuffled.ds
144
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00075_00000_doc_shuffled.ds
145
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00076_00000_doc_shuffled.ds
146
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00077_00000_doc_shuffled.ds
147
+ - data_token/as_fm3_omi2/algebraic-stack-tokenized/00078_00000_doc_shuffled.ds
148
+ - data_token/as_fm3_omi2/finemath3-tokenized/00000_00000_doc_shuffled.ds
149
+ - data_token/as_fm3_omi2/finemath3-tokenized/00001_00000_doc_shuffled.ds
150
+ - data_token/as_fm3_omi2/finemath3-tokenized/00002_00000_doc_shuffled.ds
151
+ - data_token/as_fm3_omi2/finemath3-tokenized/00003_00000_doc_shuffled.ds
152
+ - data_token/as_fm3_omi2/finemath3-tokenized/00004_00000_doc_shuffled.ds
153
+ - data_token/as_fm3_omi2/finemath3-tokenized/00005_00000_doc_shuffled.ds
154
+ - data_token/as_fm3_omi2/finemath3-tokenized/00006_00000_doc_shuffled.ds
155
+ - data_token/as_fm3_omi2/finemath3-tokenized/00007_00000_doc_shuffled.ds
156
+ - data_token/as_fm3_omi2/finemath3-tokenized/00008_00000_doc_shuffled.ds
157
+ - data_token/as_fm3_omi2/finemath3-tokenized/00009_00000_doc_shuffled.ds
158
+ - data_token/as_fm3_omi2/finemath3-tokenized/00010_00000_doc_shuffled.ds
159
+ - data_token/as_fm3_omi2/finemath3-tokenized/00011_00000_doc_shuffled.ds
160
+ - data_token/as_fm3_omi2/finemath3-tokenized/00012_00000_doc_shuffled.ds
161
+ - data_token/as_fm3_omi2/finemath3-tokenized/00013_00000_doc_shuffled.ds
162
+ - data_token/as_fm3_omi2/finemath3-tokenized/00014_00000_doc_shuffled.ds
163
+ - data_token/as_fm3_omi2/finemath3-tokenized/00015_00000_doc_shuffled.ds
164
+ - data_token/as_fm3_omi2/finemath3-tokenized/00016_00000_doc_shuffled.ds
165
+ - data_token/as_fm3_omi2/finemath3-tokenized/00017_00000_doc_shuffled.ds
166
+ - data_token/as_fm3_omi2/finemath3-tokenized/00018_00000_doc_shuffled.ds
167
+ - data_token/as_fm3_omi2/finemath3-tokenized/00019_00000_doc_shuffled.ds
168
+ - data_token/as_fm3_omi2/finemath3-tokenized/00020_00000_doc_shuffled.ds
169
+ - data_token/as_fm3_omi2/finemath3-tokenized/00021_00000_doc_shuffled.ds
170
+ - data_token/as_fm3_omi2/finemath3-tokenized/00022_00000_doc_shuffled.ds
171
+ - data_token/as_fm3_omi2/finemath3-tokenized/00023_00000_doc_shuffled.ds
172
+ - data_token/as_fm3_omi2/finemath3-tokenized/00024_00000_doc_shuffled.ds
173
+ - data_token/as_fm3_omi2/finemath3-tokenized/00025_00000_doc_shuffled.ds
174
+ - data_token/as_fm3_omi2/finemath3-tokenized/00026_00000_doc_shuffled.ds
175
+ - data_token/as_fm3_omi2/finemath3-tokenized/00027_00000_doc_shuffled.ds
176
+ - data_token/as_fm3_omi2/finemath3-tokenized/00028_00000_doc_shuffled.ds
177
+ - data_token/as_fm3_omi2/finemath3-tokenized/00029_00000_doc_shuffled.ds
178
+ - data_token/as_fm3_omi2/finemath3-tokenized/00030_00000_doc_shuffled.ds
179
+ - data_token/as_fm3_omi2/finemath3-tokenized/00031_00000_doc_shuffled.ds
180
+ - data_token/as_fm3_omi2/finemath3-tokenized/00032_00000_doc_shuffled.ds
181
+ - data_token/as_fm3_omi2/finemath3-tokenized/00033_00000_doc_shuffled.ds
182
+ - data_token/as_fm3_omi2/finemath3-tokenized/00034_00000_doc_shuffled.ds
183
+ - data_token/as_fm3_omi2/finemath3-tokenized/00035_00000_doc_shuffled.ds
184
+ - data_token/as_fm3_omi2/finemath3-tokenized/00036_00000_doc_shuffled.ds
185
+ - data_token/as_fm3_omi2/finemath3-tokenized/00037_00000_doc_shuffled.ds
186
+ - data_token/as_fm3_omi2/finemath3-tokenized/00038_00000_doc_shuffled.ds
187
+ - data_token/as_fm3_omi2/finemath3-tokenized/00039_00000_doc_shuffled.ds
188
+ - data_token/as_fm3_omi2/finemath3-tokenized/00040_00000_doc_shuffled.ds
189
+ - data_token/as_fm3_omi2/finemath3-tokenized/00041_00000_doc_shuffled.ds
190
+ - data_token/as_fm3_omi2/finemath3-tokenized/00042_00000_doc_shuffled.ds
191
+ - data_token/as_fm3_omi2/finemath3-tokenized/00043_00000_doc_shuffled.ds
192
+ - data_token/as_fm3_omi2/finemath3-tokenized/00044_00000_doc_shuffled.ds
193
+ - data_token/as_fm3_omi2/finemath3-tokenized/00045_00000_doc_shuffled.ds
194
+ - data_token/as_fm3_omi2/finemath3-tokenized/00046_00000_doc_shuffled.ds
195
+ - data_token/as_fm3_omi2/finemath3-tokenized/00047_00000_doc_shuffled.ds
196
+ - data_token/as_fm3_omi2/finemath3-tokenized/00048_00000_doc_shuffled.ds
197
+ - data_token/as_fm3_omi2/finemath3-tokenized/00049_00000_doc_shuffled.ds
198
+ - data_token/as_fm3_omi2/finemath3-tokenized/00050_00000_doc_shuffled.ds
199
+ - data_token/as_fm3_omi2/finemath3-tokenized/00051_00000_doc_shuffled.ds
200
+ - data_token/as_fm3_omi2/finemath3-tokenized/00052_00000_doc_shuffled.ds
201
+ - data_token/as_fm3_omi2/finemath3-tokenized/00053_00000_doc_shuffled.ds
202
+ - data_token/as_fm3_omi2/finemath3-tokenized/00054_00000_doc_shuffled.ds
203
+ - data_token/as_fm3_omi2/finemath3-tokenized/00055_00000_doc_shuffled.ds
204
+ - data_token/as_fm3_omi2/finemath3-tokenized/00056_00000_doc_shuffled.ds
205
+ - data_token/as_fm3_omi2/finemath3-tokenized/00057_00000_doc_shuffled.ds
206
+ - data_token/as_fm3_omi2/finemath3-tokenized/00058_00000_doc_shuffled.ds
207
+ - data_token/as_fm3_omi2/finemath3-tokenized/00059_00000_doc_shuffled.ds
208
+ - data_token/as_fm3_omi2/finemath3-tokenized/00060_00000_doc_shuffled.ds
209
+ - data_token/as_fm3_omi2/finemath3-tokenized/00061_00000_doc_shuffled.ds
210
+ - data_token/as_fm3_omi2/finemath3-tokenized/00062_00000_doc_shuffled.ds
211
+ - data_token/as_fm3_omi2/finemath3-tokenized/00063_00000_doc_shuffled.ds
212
+ - data_token/as_fm3_omi2/finemath3-tokenized/00064_00000_doc_shuffled.ds
213
+ - data_token/as_fm3_omi2/finemath3-tokenized/00065_00000_doc_shuffled.ds
214
+ - data_token/as_fm3_omi2/finemath3-tokenized/00066_00000_doc_shuffled.ds
215
+ - data_token/as_fm3_omi2/finemath3-tokenized/00067_00000_doc_shuffled.ds
216
+ - data_token/as_fm3_omi2/finemath3-tokenized/00068_00000_doc_shuffled.ds
217
+ - data_token/as_fm3_omi2/finemath3-tokenized/00069_00000_doc_shuffled.ds
218
+ - data_token/as_fm3_omi2/finemath3-tokenized/00070_00000_doc_shuffled.ds
219
+ - data_token/as_fm3_omi2/finemath3-tokenized/00071_00000_doc_shuffled.ds
220
+ - data_token/as_fm3_omi2/finemath3-tokenized/00072_00000_doc_shuffled.ds
221
+ - data_token/as_fm3_omi2/finemath3-tokenized/00073_00000_doc_shuffled.ds
222
+ - data_token/as_fm3_omi2/finemath3-tokenized/00074_00000_doc_shuffled.ds
223
+ - data_token/as_fm3_omi2/finemath3-tokenized/00075_00000_doc_shuffled.ds
224
+ - data_token/as_fm3_omi2/finemath3-tokenized/00076_00000_doc_shuffled.ds
225
+ - data_token/as_fm3_omi2/finemath3-tokenized/00077_00000_doc_shuffled.ds
226
+ - data_token/as_fm3_omi2/finemath3-tokenized/00078_00000_doc_shuffled.ds
227
+ - data_token/as_fm3_omi2/finemath3-tokenized/00079_00000_doc_shuffled.ds
228
+ - data_token/as_fm3_omi2/finemath3-tokenized/00080_00000_doc_shuffled.ds
229
+ - data_token/as_fm3_omi2/finemath3-tokenized/00081_00000_doc_shuffled.ds
230
+ - data_token/as_fm3_omi2/finemath3-tokenized/00082_00000_doc_shuffled.ds
231
+ - data_token/as_fm3_omi2/finemath3-tokenized/00083_00000_doc_shuffled.ds
232
+ - data_token/as_fm3_omi2/finemath3-tokenized/00084_00000_doc_shuffled.ds
233
+ - data_token/as_fm3_omi2/finemath3-tokenized/00085_00000_doc_shuffled.ds
234
+ - data_token/as_fm3_omi2/finemath3-tokenized/00086_00000_doc_shuffled.ds
235
+ - data_token/as_fm3_omi2/finemath3-tokenized/00087_00000_doc_shuffled.ds
236
+ - data_token/as_fm3_omi2/finemath3-tokenized/00088_00000_doc_shuffled.ds
237
+ - data_token/as_fm3_omi2/finemath3-tokenized/00089_00000_doc_shuffled.ds
238
+ - data_token/as_fm3_omi2/finemath3-tokenized/00090_00000_doc_shuffled.ds
239
+ - data_token/as_fm3_omi2/finemath3-tokenized/00091_00000_doc_shuffled.ds
240
+ - data_token/as_fm3_omi2/finemath3-tokenized/00092_00000_doc_shuffled.ds
241
+ - data_token/as_fm3_omi2/finemath3-tokenized/00093_00000_doc_shuffled.ds
242
+ - data_token/as_fm3_omi2/finemath3-tokenized/00094_00000_doc_shuffled.ds
243
+ - data_token/as_fm3_omi2/finemath3-tokenized/00095_00000_doc_shuffled.ds
244
+ - data_token/as_fm3_omi2/finemath3-tokenized/00096_00000_doc_shuffled.ds
245
+ - data_token/as_fm3_omi2/finemath3-tokenized/00097_00000_doc_shuffled.ds
246
+ - data_token/as_fm3_omi2/finemath3-tokenized/00098_00000_doc_shuffled.ds
247
+ - data_token/as_fm3_omi2/finemath3-tokenized/00099_00000_doc_shuffled.ds
248
+ - data_token/as_fm3_omi2/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
249
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
250
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
251
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
252
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
253
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
254
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
255
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
256
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
257
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
258
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
259
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
260
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
261
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
262
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
263
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
264
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
265
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
266
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
267
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
268
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
269
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
270
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
271
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
272
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
273
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
274
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
275
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
276
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
277
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
278
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
279
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
280
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
281
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00032_00000_doc_shuffled.ds
282
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00033_00000_doc_shuffled.ds
283
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00034_00000_doc_shuffled.ds
284
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00035_00000_doc_shuffled.ds
285
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00036_00000_doc_shuffled.ds
286
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00037_00000_doc_shuffled.ds
287
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00038_00000_doc_shuffled.ds
288
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00039_00000_doc_shuffled.ds
289
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00040_00000_doc_shuffled.ds
290
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00041_00000_doc_shuffled.ds
291
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00042_00000_doc_shuffled.ds
292
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00043_00000_doc_shuffled.ds
293
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00044_00000_doc_shuffled.ds
294
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00045_00000_doc_shuffled.ds
295
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00046_00000_doc_shuffled.ds
296
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00047_00000_doc_shuffled.ds
297
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00048_00000_doc_shuffled.ds
298
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00049_00000_doc_shuffled.ds
299
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00050_00000_doc_shuffled.ds
300
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00051_00000_doc_shuffled.ds
301
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00052_00000_doc_shuffled.ds
302
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00053_00000_doc_shuffled.ds
303
+ - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00054_00000_doc_shuffled.ds
304
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00000_00000_doc_shuffled.ds
305
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00001_00000_doc_shuffled.ds
306
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00002_00000_doc_shuffled.ds
307
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00003_00000_doc_shuffled.ds
308
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00004_00000_doc_shuffled.ds
309
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00005_00000_doc_shuffled.ds
310
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00006_00000_doc_shuffled.ds
311
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00007_00000_doc_shuffled.ds
312
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00008_00000_doc_shuffled.ds
313
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00009_00000_doc_shuffled.ds
314
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00010_00000_doc_shuffled.ds
315
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00011_00000_doc_shuffled.ds
316
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00012_00000_doc_shuffled.ds
317
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00013_00000_doc_shuffled.ds
318
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00014_00000_doc_shuffled.ds
319
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00015_00000_doc_shuffled.ds
320
+ - data_token/as_fm3_omi2/tinygsm-tokenized/00016_00000_doc_shuffled.ds
321
+ memmap_dtype: uint16
322
+ datasets: null
323
+ label_mask_paths: null
324
+ pad_direction: right
325
+ generate_attention_mask: false
326
+ generate_doc_lengths: false
327
+ num_workers: 32
328
+ drop_last: true
329
+ pin_memory: true
330
+ prefetch_factor: 8
331
+ persistent_workers: true
332
+ timeout: 0
333
+ seed: null
334
+ instance_filter: null
335
+ custom_dataset: null
336
+ restore_dataloader: true
337
+ fast_forward_batches: null
338
+ evaluators: []
339
+ eval_interval: 5000
340
+ tokenizer:
341
+ identifier: meta-llama/Llama-2-7b-hf
342
+ truncate_direction: right
343
+ save_folder: checkpoints/OLMo-1B-as_fm3_omi2
344
+ remote_save_folder: null
345
+ canceled_check_interval: 6000
346
+ save_interval: 3000
347
+ save_interval_unsharded: 3000
348
+ save_interval_ephemeral: null
349
+ save_num_checkpoints_to_keep: -1
350
+ save_num_unsharded_checkpoints_to_keep: -1
351
+ save_overwrite: true
352
+ force_save_unsharded: false
353
+ no_pre_train_checkpoint: false
354
+ load_path: checkpoints/OLMo-1B-as_fm3_omi2/step9000-unsharded
355
+ load_path_sharded_checkpointer: null
356
+ try_load_latest_save: false
357
+ reset_optimizer_state: false
358
+ reset_trainer_state: false
359
+ sharded_checkpointer: torch_legacy
360
+ new_style_checkpoints: null
361
+ max_duration: 1ep
362
+ global_train_batch_size: 512
363
+ device_train_batch_size: 128
364
+ device_train_microbatch_size: 16
365
+ device_eval_batch_size: 16
366
+ eval_subset_num_batches: -1
367
+ eval_on_load: false
368
+ device_train_grad_accum: 8
369
+ max_grad_norm: 1.0
370
+ max_grad_norm_ratio: null
371
+ precision: amp_bf16
372
+ wandb:
373
+ project: olmo-debug
374
+ entity: null
375
+ group: null
376
+ name: OLMo-1B-as_fm3_omi2
377
+ tags:
378
+ - watching
379
+ log_artifacts: false
380
+ rank_zero_only: true
381
+ log_interval: 1
382
+ speed_monitor:
383
+ window_size: 20
384
+ gpu_flops_available: null
385
+ console_log_interval: 1
386
+ gen1_gc_interval: 1
387
+ compile: null
388
+ distributed_strategy: fsdp
389
+ fsdp:
390
+ use_orig_params: true
391
+ sharding_strategy: FULL_SHARD
392
+ wrapping_strategy: null
393
+ precision: mixed
394
+ hybrid_sharding_num_model_replicas: null
395
+ ddp:
396
+ grad_sync_mode: batch
397
+ find_unused_params: false
398
+ single:
399
+ device: auto
400
+ softmax_auxiliary_loss: false
401
+ auxiliary_loss_multiplier: 0.0001
402
+ time_limit: null
403
+ extra_steps_after_cancel: 10
404
+ early_stopping_factor: null
405
+ save_data_indices: true
406
+ python_profiling: false
407
+ torch_profiling: false
408
+ stop_at: 62228
409
+ stop_after: null
410
+ activation_checkpointing: null
411
+ fused_loss: null
412
+ hf_datasets_cache_dir: null
413
+ module_outputs_save_steps: null
models/OLMo-1B/step18000/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 50279,
4
+ "pad_token_id": 1,
5
+ "transformers_version": "4.57.3"
6
+ }
models/OLMo-1B/step18000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b77268c42b79d7f03226ae0696a8826fdc4f183529b0f88c86315fb9bf7ff5bd
3
+ size 4557138549
models/OLMo-1B/step18000/rank0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4893148dbeec37f707599c433108ea28b587f97cced8fddfeada127f82ac3ad
3
+ size 3418099282