onkarpandit-g42
committed on
Upload params_train.yaml with huggingface_hub
Browse files - params_train.yaml +235 -0
params_train.yaml
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
train_input:
|
2 |
+
batch_size: 976
|
3 |
+
data_processor: GptHDF5MapDataProcessor
|
4 |
+
mixture:
|
5 |
+
- data_dir: /cra-406/datasets/pile/multilingual_v2/train_correct/
|
6 |
+
weight: 0.6510508179774476
|
7 |
+
- data_dir: /cra-406/datasets/github/multilingual_v2/packed_2k/train
|
8 |
+
weight: 0.055087602323960365
|
9 |
+
- data_dir: /cra-406/datasets/books3_arabic/multilingual_v2/packed_2k/books_3_arabic_train_correct_packed
|
10 |
+
weight: 0.031560734650858936
|
11 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/alkhair_train_packed
|
12 |
+
weight: 0.0008441127388845985
|
13 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/aranews_train_packed
|
14 |
+
weight: 0.00015702987060793174
|
15 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/baai_train_packed
|
16 |
+
weight: 0.02652363386071335
|
17 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/C4_train_packed
|
18 |
+
weight: 0.04370135940994404
|
19 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/ccnews_train_packed
|
20 |
+
weight: 0.006820988629070355
|
21 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/common_crawl_train_packed
|
22 |
+
weight: 0.16413286051785408
|
23 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/daypop_train_packed
|
24 |
+
weight: 0.001772579714458703
|
25 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en2ar_wikipedia_train_packed
|
26 |
+
weight: 0.006335165657431352
|
27 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en_wikipedia_train_packed
|
28 |
+
weight: 0.0035095904892209306
|
29 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/maktabah_train_packed
|
30 |
+
weight: 0.002642036817637927
|
31 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/misc_train_packed
|
32 |
+
weight: 6.954077746676907e-05
|
33 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/osian_train_packed
|
34 |
+
weight: 0.0006243331144143421
|
35 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/un_train_packed
|
36 |
+
weight: 0.001005513115682201
|
37 |
+
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/wikipedia_train_packed
|
38 |
+
weight: 0.00034892678537459647
|
39 |
+
- data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/ar/train_shuffled
|
40 |
+
weight: 0.0012430476743474177
|
41 |
+
- data_dir: /cra-406/datasets/multilingual_v2/itc_663m_split/train_shuffled
|
42 |
+
weight: 0.0013597894242614768
|
43 |
+
- data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/en/train_shuffled
|
44 |
+
weight: 0.0012103364503629503
|
45 |
+
num_workers: 1
|
46 |
+
persistent_workers: true
|
47 |
+
prefetch_factor: 10
|
48 |
+
repeat: true
|
49 |
+
shuffle: false
|
50 |
+
shuffle_seed: 1
|
51 |
+
use_worker_cache: false
|
52 |
+
vocab_size: 84992
|
53 |
+
eval_input:
|
54 |
+
batch_size: 32
|
55 |
+
data_dir: /cb/customers/g42/datasets/multilingual_v2/pile_val_correct_packed
|
56 |
+
data_processor: GptHDF5MapDataProcessor
|
57 |
+
num_workers: 1
|
58 |
+
repeat: false
|
59 |
+
shuffle: false
|
60 |
+
use_worker_cache: false
|
61 |
+
vocab_size: 84992
|
62 |
+
model:
|
63 |
+
mixed_precision: true
|
64 |
+
fp16_type: cbfloat16
|
65 |
+
boundary_casting: false
|
66 |
+
lora_params: null
|
67 |
+
vocab_size: 84992
|
68 |
+
embedding_layer_norm: false
|
69 |
+
embedding_dropout_rate: 0.0
|
70 |
+
share_embedding_weights: true
|
71 |
+
position_embedding_type: alibi
|
72 |
+
max_position_embeddings: 2048
|
73 |
+
position_embedding_offset: 0
|
74 |
+
num_relative_attention_buckets: 32
|
75 |
+
rotary_dim: null
|
76 |
+
rope_theta: 10000
|
77 |
+
pad_rope: false
|
78 |
+
alibi_trainable_slopes: false
|
79 |
+
pos_scaling_factor: 1.0
|
80 |
+
hidden_size: 1088
|
81 |
+
num_hidden_layers: 14
|
82 |
+
dropout_rate: 0.0
|
83 |
+
norm_type: layernorm
|
84 |
+
layer_norm_epsilon: 1.0e-05
|
85 |
+
num_heads: 17
|
86 |
+
attention_module: aiayn_attention
|
87 |
+
extra_attention_params: {}
|
88 |
+
attention_type: scaled_dot_product
|
89 |
+
attention_dropout_rate: 0.0
|
90 |
+
use_projection_bias_in_attention: true
|
91 |
+
use_ffn_bias_in_attention: true
|
92 |
+
attention_softmax_fp32: false
|
93 |
+
attention_kernel: optimized_beta
|
94 |
+
attention_sliding_window_length: null
|
95 |
+
scale_qk_dot_by_layer_idx: false
|
96 |
+
fixed_sparse_attention: null
|
97 |
+
filter_size: 2912
|
98 |
+
nonlinearity: swiglu
|
99 |
+
use_ffn_bias: true
|
100 |
+
use_bias_in_output: false
|
101 |
+
loss_scaling: num_tokens
|
102 |
+
loss_weight: 1.0
|
103 |
+
embeddings_scale: 9.1705785388303
|
104 |
+
scale_qk_dot_by_d: true
|
105 |
+
output_logits_scale: 0.2576902348606329
|
106 |
+
initializer:
|
107 |
+
name: truncated_normal
|
108 |
+
mean: 0.0
|
109 |
+
std: 0.04203434605680388
|
110 |
+
a: -0.08406869211360776
|
111 |
+
b: 0.08406869211360776
|
112 |
+
nonlinearity: null
|
113 |
+
mode: null
|
114 |
+
scale: null
|
115 |
+
distribution: null
|
116 |
+
initializer_range: 0.02
|
117 |
+
embedding_initializer:
|
118 |
+
name: truncated_normal
|
119 |
+
mean: 0.0
|
120 |
+
std: 0.0866560243479838
|
121 |
+
a: -0.1733120486959676
|
122 |
+
b: 0.1733120486959676
|
123 |
+
nonlinearity: null
|
124 |
+
mode: null
|
125 |
+
scale: null
|
126 |
+
distribution: null
|
127 |
+
output_layer_initializer:
|
128 |
+
name: truncated_normal
|
129 |
+
mean: 0.0
|
130 |
+
std: 0.007943744727823684
|
131 |
+
a: -0.015887489455647368
|
132 |
+
b: 0.015887489455647368
|
133 |
+
nonlinearity: null
|
134 |
+
mode: null
|
135 |
+
scale: null
|
136 |
+
distribution: null
|
137 |
+
compute_eval_metrics: true
|
138 |
+
sparsity: null
|
139 |
+
optimizer:
|
140 |
+
optimizer_type: AdamW
|
141 |
+
weight_decay: 0.1
|
142 |
+
log_summaries: true
|
143 |
+
loss_scaling_factor: dynamic
|
144 |
+
learning_rate:
|
145 |
+
- end_learning_rate: 0.015625
|
146 |
+
initial_learning_rate: 0.0
|
147 |
+
scheduler: Linear
|
148 |
+
total_iters: 187
|
149 |
+
- end_learning_rate: 1.9196e-05
|
150 |
+
initial_learning_rate: 0.015625
|
151 |
+
scheduler: Linear
|
152 |
+
total_iters: 240133
|
153 |
+
max_gradient_norm: 1.0
|
154 |
+
adjust_learning_rate:
|
155 |
+
decoder_kernel: 0.23529411764705882
|
156 |
+
betas:
|
157 |
+
- 0.9
|
158 |
+
- 0.95
|
159 |
+
correct_bias: true
|
160 |
+
eps: 1.0e-08
|
161 |
+
runconfig:
|
162 |
+
steps_per_epoch: null
|
163 |
+
max_steps: 240320
|
164 |
+
mgmt_address: null
|
165 |
+
mount_dirs:
|
166 |
+
- /cra-406
|
167 |
+
num_epochs: null
|
168 |
+
python_paths:
|
169 |
+
- /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src
|
170 |
+
compile_dir: null
|
171 |
+
checkpoint_path: null
|
172 |
+
credentials_path: null
|
173 |
+
debug_args_path: null
|
174 |
+
retrace_every_iteration: null
|
175 |
+
eval_steps: 5219
|
176 |
+
init_method: env://
|
177 |
+
job_time_sec: null
|
178 |
+
job_labels:
|
179 |
+
- Name=Neha_Sengupta
|
180 |
+
- Organization=Inception
|
181 |
+
- Model=Jais_256M
|
182 |
+
- Mode=Train
|
183 |
+
- Num_CSX=8
|
184 |
+
- Language=Bilingual
|
185 |
+
- Type=Train
|
186 |
+
- Dataset=AraV5_Pile_Github_Books_UAE_ITC
|
187 |
+
job_priority: p2
|
188 |
+
seed: 1
|
189 |
+
mgmt_namespace: cra-406
|
190 |
+
load_checkpoint_states: model,optimizer,global_step,dataloader,lr_scheduler
|
191 |
+
target_device: CSX
|
192 |
+
mode: train
|
193 |
+
wsc_log_level: null
|
194 |
+
autoload_last_checkpoint: true
|
195 |
+
check_loss_values: true
|
196 |
+
disable_strict_checkpoint_loading: null
|
197 |
+
dist_addr: localhost:8888
|
198 |
+
dist_backend: nccl
|
199 |
+
checkpoint_steps: 24032
|
200 |
+
disable_version_check: null
|
201 |
+
drop_data: false
|
202 |
+
enable_distributed: false
|
203 |
+
model_dir: artifacts/model_dir_256M
|
204 |
+
save_initial_checkpoint: false
|
205 |
+
precision_opt_level: 1
|
206 |
+
num_workers_per_csx: 0
|
207 |
+
validate_only: null
|
208 |
+
logging: null
|
209 |
+
sync_batchnorm: false
|
210 |
+
compile_only: null
|
211 |
+
log_steps: 1
|
212 |
+
num_steps: null
|
213 |
+
transfer_processes: null
|
214 |
+
num_wgt_servers: null
|
215 |
+
num_csx: 8
|
216 |
+
num_act_servers: null
|
217 |
+
eval_frequency: null
|
218 |
+
execute_crd_memory_gi: null
|
219 |
+
compile_crd_memory_gi: null
|
220 |
+
op_profiler_config: null
|
221 |
+
dump_activations: false
|
222 |
+
log_input_summaries: false
|
223 |
+
main_process_id: 0
|
224 |
+
max_checkpoints: 100000
|
225 |
+
summary_dir: null
|
226 |
+
lazy_initialization: true
|
227 |
+
use_cstorch_optimizer_step: false
|
228 |
+
wrk_memory_gi: null
|
229 |
+
act_memory_gi: null
|
230 |
+
cmd_memory_gi: null
|
231 |
+
wgt_memory_gi: null
|
232 |
+
experimental: {}
|
233 |
+
ini:
|
234 |
+
ws_opt_speculate_optimizer: true
|
235 |
+
debug_args: null
|