loubnabnl committed
Commit ba3014a
Parent: ce00ba4

Create brrr.yaml

Files changed (1)
1. brrr.yaml +286 -0
brrr.yaml ADDED
@@ -0,0 +1,286 @@
+ general:
+   name: 1b_starcoderdata_lr3
+   ignore_sanity_checks: true
+   kill_switch_path: /fsx/loubna/br4-experiments/kill_loubna_starcoder
+
+ profile: null
+ # profile:
+ #   profiler_export_path: null # Can be a path
+
+ checkpoints:
+   checkpoints_path: /fsx/loubna/br4-experiments/checkpoints/debug/1b_star
+   load_from_specific_checkpoint: null
+   checkpoint_interval: 10000
+
+ parallelism:
+   dp: 64
+   pp: 1
+   tp: 1
+   pp_engine: 1f1b
+   tp_mode: REDUCE_SCATTER
+   tp_column_linear_async_communication: true
+   # recompute_granularity: selective
+
+ model:
+   hidden_size: 2048
+   num_attention_heads: 16
+   n_inner: 8192
+   n_layer: 24
+   max_position_embeddings: 8192
+   vocab_size: 49152
+   layer_norm_epsilon: 0.00001
+   scale_attn_weights: true
+   activation_function: gelu
+   attention_softmax_in_fp32: true
+   resid_pdrop: 0.1
+   attn_pdrop: 0.1
+   embd_pdrop: 0.1
+   pad_key_length: true
+   hf_gpt2_model_name: /fsx/loubna/starcoder-tokenizer/15b
+   make_vocab_size_divisible_by: 128
+   init_method:
+     std: 0.02209 # Basically 1/sqrt(hidden_size)
+   dtype: bfloat16
+   seed: 42
+
+
+ logging:
+   # 'debug', 'info', 'warning', 'error', 'critical' and 'passive'
+   log_level: 'info'
+   log_level_replica: 'info'
+   iteration_step_info_interval: 1
+   tensorboard_logger:
+     tensorboard_dir: /fsx/loubna/br4-experiments/tensorboard/debug
+
+ tokens:
+   sequence_length: 8192
+   train_steps: 150000
+   micro_batch_size: 1 # TODO @thomasw21
+   batch_accumulation_per_replica: 1 # TODO @thomasw21
+   val_check_interval: 2500
+   limit_val_batches: 2
+
+ optimizer:
+   zero_stage: 0
+   weight_decay: 0.1
+   clip_grad: 1.0
+
+   accumulate_grad_in_fp32: true
+
+   adam_eps: 1.0e-8
+   adam_beta1: 0.9
+   adam_beta2: 0.95 # Copied from LLaMA
+   learning_rate: 3.0e-4 # Copied from LLaMA
+
+ learning_rate_scheduler:
+   lr_warmup_steps: 2000
+   lr_warmup_style: linear
+   lr_decay_steps: 150000
+   lr_decay_style: cosine
+   min_decay_lr: 3.0e-5 # Copied from LLaMA
+
+ data:
+   seed: 1234 # mimic StarCoder training
+   num_loading_workers: 2
+   dataset:
+     data_prefix:
+       - 3.0
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document
+       - 0.01
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document
+       - 53.89
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document
+       - 1.78
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document
+       - 0.85
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document
+       - 5.68
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document
+       - 0.01
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document
+       - 1.31
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document
+       - 0.98
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document
+       - 0.08
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document
+       - 0.03
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document
+       - 0.09
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document
+       - 1.12
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document
+       - 23.78
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document
+       - 0.7
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document
+       - 0.61
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document
+       - 0.26
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document
+       - 1.68
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document
+       - 2.23
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document
+       - 0.3
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document
+       - 0.31
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document
+       - 0.45
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document
+       - 0.12
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document
+       - 6.81
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document
+       - 9.11
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document
+       - 0.06
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document
+       - 44.66
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document
+       - 0.58
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document
+       - 2.23
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document
+       - 0.01
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document
+       - 1.25
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document
+       - 1.03
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document
+       - 1.31
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document
+       - 2.87
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document
+       - 0.01
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document
+       - 0.05
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document
+       - 3.32
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document
+       - 0.03
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document
+       - 0.19
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document
+       - 0.39
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document
+       - 5.2
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document
+       - 0.02
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document
+       - 1.56
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document
+       - 0.01
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document
+       - 0.07
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document
+       - 0.41
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document
+       - 3.66
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document
+       - 0.56
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document
+       - 0.03
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document
+       - 0.001
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document
+       - 0.23
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document
+       - 0.02
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document
+       - 0.01
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document
+       - 4.69
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document
+       - 0.35
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document
+       - 0.33
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document
+       - 0.01
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document
+       - 3.09
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document
+       - 0.46
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document
+       - 0.2
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document
+       - 0.05
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document
+       - 0.04
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document
+       - 11.09
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document
+       - 0.4
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document
+       - 0.3
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document
+       - 0.42
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document
+       - 48.92
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document
+       - 0.64
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document
+       - 1.4
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document
+       - 0.71
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document
+       - 0.91
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document
+       - 29.36
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document
+       - 86.94
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document
+       - 64.71
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document
+       - 74.93
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document
+       - 60.89
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document
+       - 60.4
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document
+       - 26.52
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document
+       - 0.001
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document
+       - 1.42
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document
+       - 0.94
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document
+       - 0.01
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document
+       - 0.0002
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document
+       - 0.11
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document
+       - 0.18
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document
+       - 0.05
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document
+       - 1.0
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document
+       - 1.0
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document
+       - 54.4
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document
+       - 32.0
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document
+       - 7.12
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document
+       - 6.0
+       - /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document
+     index_mapping_dir: null # path to save index-mapping .npy files; defaults to the same location as data_prefix
+     splits_string: 0.969,0.999,1 # TODO @thomasw21: we should probably define a split per dataset instead of a single global split
+     skip_warmup: true
+     dataloader_type: single # cyclic
+     validation_drop_last: true # Set to false if the last partial validation batch should be consumed
+     eod_mask_loss: false # Mask the loss on end-of-document tokens
+     no_seqlen_plus_one_input_tokens: false # Set to true to disable fetching (sequence_length + 1) input tokens; instead fetch (sequence_length) tokens and mask the last one
+     pad_samples_to_global_batch_size: false # Set to true to pad the last partial batch with -1s up to the global batch size
+   # dataset:
+   #   hf_dataset_name: stas/openwebtext-10k
+   #   hf_dataset_config_name: null
+   #   hf_dataset_split: train
+   #   dataset_processing_num_proc_per_process: 12
+   #   dataset_overwrite_cache: true
+   #   text_column_name: text
+   #
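
A few training-scale numbers follow directly from the `parallelism`, `tokens`, and `model` sections above. Below is a minimal back-of-the-envelope sketch in plain Python (no brrr imports; the variable names simply mirror the config keys), not part of the training stack itself:

```python
import math

# Values copied from the config above.
dp = 64                              # parallelism.dp
micro_batch_size = 1                 # tokens.micro_batch_size
batch_accumulation_per_replica = 1   # tokens.batch_accumulation_per_replica
sequence_length = 8192               # tokens.sequence_length
train_steps = 150_000                # tokens.train_steps
hidden_size = 2048                   # model.hidden_size

# Each data-parallel replica contributes micro_batch_size *
# batch_accumulation_per_replica sequences per optimizer step.
global_batch_size = dp * micro_batch_size * batch_accumulation_per_replica
tokens_per_step = global_batch_size * sequence_length
total_tokens = tokens_per_step * train_steps

print(global_batch_size)             # 64 sequences per step
print(tokens_per_step)               # 524288 tokens per step (~0.5M)
print(f"{total_tokens / 1e9:.1f}B")  # 78.6B tokens over the full run

# The init_method.std value matches its "1/sqrt(hidden_size)" comment:
print(1 / math.sqrt(hidden_size))    # 0.022097..., i.e. ~0.02209
```

So the run sees roughly 78.6B tokens with a fairly small global batch of about 0.5M tokens per step, consistent with the TODO notes on `micro_batch_size` and `batch_accumulation_per_replica`.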
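The `data_prefix` list appears to follow the Megatron-LM convention of a flat list of alternating (weight, path) entries describing a blended dataset. Here is a hedged sketch of turning that list into normalized sampling probabilities; `parse_data_prefix` is an illustrative helper, not brrr's actual loader, and it assumes the file above is saved locally as `brrr.yaml`:

```python
import yaml

def parse_data_prefix(data_prefix):
    """Split a flat [weight, path, weight, path, ...] list into parallel
    lists and normalize the weights into sampling probabilities."""
    assert len(data_prefix) % 2 == 0, "expected alternating weight/path pairs"
    weights = [float(w) for w in data_prefix[0::2]]
    paths = [str(p) for p in data_prefix[1::2]]
    total = sum(weights)
    return paths, [w / total for w in weights]

with open("brrr.yaml") as f:  # hypothetical local copy of the config above
    config = yaml.safe_load(f)

paths, probs = parse_data_prefix(config["data"]["dataset"]["data_prefix"])
print(len(paths))  # 92 weighted datasets
# Heaviest components by weight: java (86.94), markdown (74.93),
# javascript (64.71), php (60.89), python (60.4), gh_issues (54.4), ...
for path, prob in sorted(zip(paths, probs), key=lambda x: -x[1])[:5]:
    print(f"{prob:.2%}  {path}")
```

If `splits_string` uses the same Megatron convention of cumulative fractions, `0.969,0.999,1` corresponds to roughly 96.9% train, 3.0% validation, and 0.1% test, which the TODO comment suggests replacing with per-dataset splits.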
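The `learning_rate_scheduler` section describes a linear warmup over 2,000 steps to the peak learning rate of 3.0e-4, followed by a cosine decay to `min_decay_lr` over the 150,000-step run. A sketch of the standard form of that schedule follows; whether brrr counts the decay window from step 0 or from the end of warmup is an assumption here, not something the config states:

```python
import math

learning_rate = 3.0e-4    # optimizer.learning_rate
min_decay_lr = 3.0e-5     # learning_rate_scheduler.min_decay_lr
lr_warmup_steps = 2_000   # learning_rate_scheduler.lr_warmup_steps
lr_decay_steps = 150_000  # learning_rate_scheduler.lr_decay_steps

def lr_at(step: int) -> float:
    if step < lr_warmup_steps:
        # Linear warmup from 0 to the peak learning rate.
        return learning_rate * step / lr_warmup_steps
    if step >= lr_decay_steps:
        return min_decay_lr
    # Cosine decay from learning_rate down to min_decay_lr; the decay
    # window is assumed to span from the end of warmup to lr_decay_steps.
    progress = (step - lr_warmup_steps) / (lr_decay_steps - lr_warmup_steps)
    return min_decay_lr + 0.5 * (learning_rate - min_decay_lr) * (1 + math.cos(math.pi * progress))

print(lr_at(0), lr_at(2_000), lr_at(76_000), lr_at(150_000))
# 0.0 0.0003 0.000165 3e-05
```

Since `lr_decay_steps` equals `train_steps`, the learning rate reaches its 3.0e-5 floor exactly at the end of training.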