onkarpandit-g42 committed on
Commit
b7bdf95
1 Parent(s): b5cf495

Upload params_train.yaml with huggingface_hub

Files changed (1)
  1. params_train.yaml +235 -0
params_train.yaml ADDED
@@ -0,0 +1,235 @@
+ train_input:
+   batch_size: 976
+   data_processor: GptHDF5MapDataProcessor
+   mixture:
+   - data_dir: /cra-406/datasets/pile/multilingual_v2/train_correct/
+     weight: 0.6510508179774476
+   - data_dir: /cra-406/datasets/github/multilingual_v2/packed_2k/train
+     weight: 0.055087602323960365
+   - data_dir: /cra-406/datasets/books3_arabic/multilingual_v2/packed_2k/books_3_arabic_train_correct_packed
+     weight: 0.031560734650858936
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/alkhair_train_packed
+     weight: 0.0008441127388845985
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/aranews_train_packed
+     weight: 0.00015702987060793174
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/baai_train_packed
+     weight: 0.02652363386071335
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/C4_train_packed
+     weight: 0.04370135940994404
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/ccnews_train_packed
+     weight: 0.006820988629070355
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/common_crawl_train_packed
+     weight: 0.16413286051785408
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/daypop_train_packed
+     weight: 0.001772579714458703
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en2ar_wikipedia_train_packed
+     weight: 0.006335165657431352
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en_wikipedia_train_packed
+     weight: 0.0035095904892209306
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/maktabah_train_packed
+     weight: 0.002642036817637927
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/misc_train_packed
+     weight: 6.954077746676907e-05
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/osian_train_packed
+     weight: 0.0006243331144143421
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/un_train_packed
+     weight: 0.001005513115682201
+   - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/wikipedia_train_packed
+     weight: 0.00034892678537459647
+   - data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/ar/train_shuffled
+     weight: 0.0012430476743474177
+   - data_dir: /cra-406/datasets/multilingual_v2/itc_663m_split/train_shuffled
+     weight: 0.0013597894242614768
+   - data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/en/train_shuffled
+     weight: 0.0012103364503629503
+   num_workers: 1
+   persistent_workers: true
+   prefetch_factor: 10
+   repeat: true
+   shuffle: false
+   shuffle_seed: 1
+   use_worker_cache: false
+   vocab_size: 84992
+ eval_input:
+   batch_size: 32
+   data_dir: /cb/customers/g42/datasets/multilingual_v2/pile_val_correct_packed
+   data_processor: GptHDF5MapDataProcessor
+   num_workers: 1
+   repeat: false
+   shuffle: false
+   use_worker_cache: false
+   vocab_size: 84992
+ model:
+   mixed_precision: true
+   fp16_type: cbfloat16
+   boundary_casting: false
+   lora_params: null
+   vocab_size: 84992
+   embedding_layer_norm: false
+   embedding_dropout_rate: 0.0
+   share_embedding_weights: true
+   position_embedding_type: alibi
+   max_position_embeddings: 2048
+   position_embedding_offset: 0
+   num_relative_attention_buckets: 32
+   rotary_dim: null
+   rope_theta: 10000
+   pad_rope: false
+   alibi_trainable_slopes: false
+   pos_scaling_factor: 1.0
+   hidden_size: 1088
+   num_hidden_layers: 14
+   dropout_rate: 0.0
+   norm_type: layernorm
+   layer_norm_epsilon: 1.0e-05
+   num_heads: 17
+   attention_module: aiayn_attention
+   extra_attention_params: {}
+   attention_type: scaled_dot_product
+   attention_dropout_rate: 0.0
+   use_projection_bias_in_attention: true
+   use_ffn_bias_in_attention: true
+   attention_softmax_fp32: false
+   attention_kernel: optimized_beta
+   attention_sliding_window_length: null
+   scale_qk_dot_by_layer_idx: false
+   fixed_sparse_attention: null
+   filter_size: 2912
+   nonlinearity: swiglu
+   use_ffn_bias: true
+   use_bias_in_output: false
+   loss_scaling: num_tokens
+   loss_weight: 1.0
+   embeddings_scale: 9.1705785388303
+   scale_qk_dot_by_d: true
+   output_logits_scale: 0.2576902348606329
+   initializer:
+     name: truncated_normal
+     mean: 0.0
+     std: 0.04203434605680388
+     a: -0.08406869211360776
+     b: 0.08406869211360776
+     nonlinearity: null
+     mode: null
+     scale: null
+     distribution: null
+   initializer_range: 0.02
+   embedding_initializer:
+     name: truncated_normal
+     mean: 0.0
+     std: 0.0866560243479838
+     a: -0.1733120486959676
+     b: 0.1733120486959676
+     nonlinearity: null
+     mode: null
+     scale: null
+     distribution: null
+   output_layer_initializer:
+     name: truncated_normal
+     mean: 0.0
+     std: 0.007943744727823684
+     a: -0.015887489455647368
+     b: 0.015887489455647368
+     nonlinearity: null
+     mode: null
+     scale: null
+     distribution: null
+   compute_eval_metrics: true
+ sparsity: null
+ optimizer:
+   optimizer_type: AdamW
+   weight_decay: 0.1
+   log_summaries: true
+   loss_scaling_factor: dynamic
+   learning_rate:
+   - end_learning_rate: 0.015625
+     initial_learning_rate: 0.0
+     scheduler: Linear
+     total_iters: 187
+   - end_learning_rate: 1.9196e-05
+     initial_learning_rate: 0.015625
+     scheduler: Linear
+     total_iters: 240133
+   max_gradient_norm: 1.0
+   adjust_learning_rate:
+     decoder_kernel: 0.23529411764705882
+   betas:
+   - 0.9
+   - 0.95
+   correct_bias: true
+   eps: 1.0e-08
+ runconfig:
+   steps_per_epoch: null
+   max_steps: 240320
+   mgmt_address: null
+   mount_dirs:
+   - /cra-406
+   num_epochs: null
+   python_paths:
+   - /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src
+   compile_dir: null
+   checkpoint_path: null
+   credentials_path: null
+   debug_args_path: null
+   retrace_every_iteration: null
+   eval_steps: 5219
+   init_method: env://
+   job_time_sec: null
+   job_labels:
+   - Name=Neha_Sengupta
+   - Organization=Inception
+   - Model=Jais_256M
+   - Mode=Train
+   - Num_CSX=8
+   - Language=Bilingual
+   - Type=Train
+   - Dataset=AraV5_Pile_Github_Books_UAE_ITC
+   job_priority: p2
+   seed: 1
+   mgmt_namespace: cra-406
+   load_checkpoint_states: model,optimizer,global_step,dataloader,lr_scheduler
+   target_device: CSX
+   mode: train
+   wsc_log_level: null
+   autoload_last_checkpoint: true
+   check_loss_values: true
+   disable_strict_checkpoint_loading: null
+   dist_addr: localhost:8888
+   dist_backend: nccl
+   checkpoint_steps: 24032
+   disable_version_check: null
+   drop_data: false
+   enable_distributed: false
+   model_dir: artifacts/model_dir_256M
+   save_initial_checkpoint: false
+   precision_opt_level: 1
+   num_workers_per_csx: 0
+   validate_only: null
+   logging: null
+   sync_batchnorm: false
+   compile_only: null
+   log_steps: 1
+   num_steps: null
+   transfer_processes: null
+   num_wgt_servers: null
+   num_csx: 8
+   num_act_servers: null
+   eval_frequency: null
+   execute_crd_memory_gi: null
+   compile_crd_memory_gi: null
+   op_profiler_config: null
+   dump_activations: false
+   log_input_summaries: false
+   main_process_id: 0
+   max_checkpoints: 100000
+   summary_dir: null
+   lazy_initialization: true
+   use_cstorch_optimizer_step: false
+   wrk_memory_gi: null
+   act_memory_gi: null
+   cmd_memory_gi: null
+   wgt_memory_gi: null
+   experimental: {}
+   ini:
+     ws_opt_speculate_optimizer: true
+   debug_args: null
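
Two things in this config are worth sanity-checking before reusing it: the 20 sampling weights under train_input.mixture should sum to roughly 1.0, and the two Linear learning-rate schedules (a 187-step warmup to 0.015625 followed by a 240,133-step decay to 1.9196e-05) should together cover runconfig.max_steps = 240,320 iterations. Below is a minimal sketch that checks both, assuming the file has been downloaded locally as params_train.yaml and that PyYAML is installed; the lr_at helper is purely illustrative and is not part of the Cerebras Model Zoo API.

import yaml

# Load a local copy of this repo's params_train.yaml (hypothetical path).
with open("params_train.yaml") as f:
    params = yaml.safe_load(f)

# 1) The per-corpus sampling weights in train_input.mixture should sum to ~1.0.
weights = [entry["weight"] for entry in params["train_input"]["mixture"]]
print(f"mixture components: {len(weights)}, total weight: {sum(weights):.6f}")

# 2) Reconstruct the piecewise-linear LR schedule: 187 warmup steps from 0.0 to
#    0.015625, then 240,133 decay steps down to 1.9196e-05.
def lr_at(step, schedules):
    """Illustrative helper: linear interpolation within each schedule segment."""
    offset = 0
    for sched in schedules:
        n = sched["total_iters"]
        if step < offset + n:
            frac = (step - offset) / n
            lo, hi = sched["initial_learning_rate"], sched["end_learning_rate"]
            return lo + frac * (hi - lo)
        offset += n
    return schedules[-1]["end_learning_rate"]

schedules = params["optimizer"]["learning_rate"]
for step in (0, 187, 120000, 240320):
    print(f"step {step:>6}: lr = {lr_at(step, schedules):.6g}")

Since 187 + 240,133 = 240,320, the warmup and decay segments exactly span max_steps.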