stefan-it committed on
Commit
f21ca04
1 Parent(s): f601a0b

gin: add pre-training config

Browse files
Files changed (1) hide show
  1. operative_config.gin +363 -0
operative_config.gin ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import mesh_tensorflow.optimize
import mesh_tensorflow.transformer.dataset as mesh_tensorflow2
import mesh_tensorflow.transformer.learning_rate_schedules as mesh_tensorflow3
import mesh_tensorflow.transformer.t2t_vocabulary as mesh_tensorflow4
import mesh_tensorflow.transformer.transformer as mesh_tensorflow5
import mesh_tensorflow.transformer.transformer_layers as mesh_tensorflow6
import mesh_tensorflow.transformer.utils as mesh_tensorflow7
import t5.models.mesh_transformer

# Macros:
# ==============================================================================
d_ff = 3072
d_kv = 64
d_model = 768
dropout_rate = 0.0
inputs_length = 512
mean_noise_span_length = 3.0
MIXTURE_NAME = 'gc4_corpus'
noise_density = 0.15
num_heads = 12
num_layers = 36

# Parameters for adafactor_decay_rate_pow:
# ==============================================================================
adafactor_decay_rate_pow.offset = 0

# Parameters for AdafactorOptimizer:
# ==============================================================================
AdafactorOptimizer.beta1 = 0.0
AdafactorOptimizer.clipping_threshold = 1.0
AdafactorOptimizer.decay_rate = None
AdafactorOptimizer.epsilon1 = 1e-30
AdafactorOptimizer.epsilon2 = 0.001
AdafactorOptimizer.factored = True
AdafactorOptimizer.min_dim_size_to_factor = 128
AdafactorOptimizer.multiply_by_parameter_scale = True

# Parameters for Bitransformer:
# ==============================================================================
Bitransformer.shared_embedding = True

# Parameters for denoise:
# ==============================================================================
denoise.passthrough_feature_keys = None

# Parameters for decoder/DenseReluDense:
# ==============================================================================
decoder/DenseReluDense.activation = 'relu'
decoder/DenseReluDense.dropout_rate = %dropout_rate
decoder/DenseReluDense.hidden_size = %d_ff
decoder/DenseReluDense.use_bias = False

# Parameters for encoder/DenseReluDense:
# ==============================================================================
encoder/DenseReluDense.activation = 'relu'
encoder/DenseReluDense.dropout_rate = %dropout_rate
encoder/DenseReluDense.hidden_size = %d_ff
encoder/DenseReluDense.use_bias = False

# Parameters for enc_dec_attention:
# ==============================================================================
# None.

# Parameters for enc_dec_attention_bias:
# ==============================================================================
# None.

# Parameters for decoder/EncDecAttention:
# ==============================================================================
decoder/EncDecAttention.relative_attention_type = None

# Parameters for get_variable_dtype:
# ==============================================================================
get_variable_dtype.activation_dtype = 'bfloat16'

# Parameters for get_vocab_embedding_cls:
# ==============================================================================
# None.

# Parameters for get_vocabulary:
# ==============================================================================
get_vocabulary.mixture_or_task_name = %MIXTURE_NAME

# Parameters for decoder/LayerStack:
# ==============================================================================
decoder/LayerStack.dropout_rate = None
decoder/LayerStack.norm_epsilon = None
decoder/LayerStack.recompute_grads = False
decoder/LayerStack.sublayers_final = \
    [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
decoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
decoder/LayerStack.sublayers_per_layer = \
    [@transformer.sublayer_rms_norm,
     @transformer.sublayer_call_layer,
     @transformer.sublayer_dropout,
     @transformer.sublayer_residual]

# Parameters for encoder/LayerStack:
# ==============================================================================
encoder/LayerStack.dropout_rate = None
encoder/LayerStack.norm_epsilon = None
encoder/LayerStack.recompute_grads = False
encoder/LayerStack.sublayers_final = \
    [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
encoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
encoder/LayerStack.sublayers_per_layer = \
    [@transformer.sublayer_rms_norm,
     @transformer.sublayer_call_layer,
     @transformer.sublayer_dropout,
     @transformer.sublayer_residual]

# Parameters for learning_rate_schedule_noam:
# ==============================================================================
learning_rate_schedule_noam.linear_decay_fraction = 0.0
learning_rate_schedule_noam.multiplier = 1.0
learning_rate_schedule_noam.offset = 0
learning_rate_schedule_noam.warmup_steps = 10000

# Parameters for make_bitransformer:
# ==============================================================================
make_bitransformer.decoder_name = 'decoder'
make_bitransformer.encoder_name = 'encoder'

# Parameters for decoder/make_layer_stack:
# ==============================================================================
decoder/make_layer_stack.block_scope = True
decoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
decoder/make_layer_stack.num_layers = %num_layers

# Parameters for encoder/make_layer_stack:
# ==============================================================================
encoder/make_layer_stack.block_scope = True
encoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
encoder/make_layer_stack.num_layers = %num_layers

# Parameters for mesh_train_dataset_fn:
# ==============================================================================
mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
mesh_train_dataset_fn.pack = True
mesh_train_dataset_fn.seed = None
mesh_train_dataset_fn.shuffle = True
mesh_train_dataset_fn.use_cached = False

# Parameters for noise_span_to_unique_sentinel:
# ==============================================================================
# None.

# Parameters for nonnoise_span_to_unique_sentinel:
# ==============================================================================
# None.

# Parameters for pack_dataset:
# ==============================================================================
pack_dataset.use_custom_ops = False

# Parameters for pack_or_pad:
# ==============================================================================
# None.

# Parameters for random_spans_helper:
# ==============================================================================
random_spans_helper.extra_tokens_per_span_inputs = 1
random_spans_helper.extra_tokens_per_span_targets = 1
random_spans_helper.inputs_length = %inputs_length
random_spans_helper.mean_noise_span_length = %mean_noise_span_length
random_spans_helper.noise_density = %noise_density
random_spans_helper.verbose = False

# Parameters for random_spans_noise_mask:
# ==============================================================================
# None.

# Parameters for random_spans_tokens_length:
# ==============================================================================
# None.

# Parameters for reduce_concat_tokens:
# ==============================================================================
# None.

# Parameters for rewrite_stack_variables:
# ==============================================================================
rewrite_stack_variables.max_combined_variable_size = 536870912

# Parameters for run:
# ==============================================================================
run.autostack = True
run.batch_size = ('tokens_per_batch', 65536)
run.checkpoint_input_pipeline = False
run.dataset_split = 'train'
run.ensemble_inputs = None
run.eval_checkpoint_step = None
run.eval_dataset_fn = None
run.eval_summary_dir = None
run.export_checkpoint_step = None
run.export_path = ''
run.init_checkpoint = None
run.iterations_per_loop = 100
run.keep_checkpoint_max = None
run.layout_rules = \
    'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
run.learning_rate_schedule = @learning_rate_schedules.learning_rate_schedule_noam
run.mesh_devices = None
run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
run.mode = 'train'
run.model_type = 'bitransformer'
run.optimizer = @optimize.AdafactorOptimizer
run.output_eval_examples = True
run.perplexity_eval_steps = 100
run.predict_fn = None
run.save_checkpoints_steps = 50000
run.seen_data_init_step = 0
run.sequence_length = {'inputs': 512, 'targets': 128}
run.skip_seen_data = False
run.total_run_steps = None
run.train_dataset_fn = @t5.models.mesh_transformer.mesh_train_dataset_fn
run.train_steps = 524288
run.variable_filter = None

# Parameters for select_random_chunk:
# ==============================================================================
select_random_chunk.additional_feature_keys = None
select_random_chunk.additional_passthrough_keys = None
select_random_chunk.min_length = None
select_random_chunk.passthrough_feature_keys = None
select_random_chunk.sequence_length = None
select_random_chunk.uniform_random_start = False

# Parameters for decoder/SelfAttention:
# ==============================================================================
decoder/SelfAttention.attention_func = None
decoder/SelfAttention.attention_kwargs = None
decoder/SelfAttention.combine_dims = True
decoder/SelfAttention.dropout_rate = %dropout_rate
decoder/SelfAttention.fold_scaling_into_initializer = True
decoder/SelfAttention.keep_query_heads_dims = False
decoder/SelfAttention.key_value_size = %d_kv
decoder/SelfAttention.num_heads = %num_heads
decoder/SelfAttention.num_memory_heads = 0
decoder/SelfAttention.relative_attention_num_buckets = 32
decoder/SelfAttention.relative_attention_type = 'bias_shared'
decoder/SelfAttention.shared_kv = False

# Parameters for encoder/SelfAttention:
# ==============================================================================
encoder/SelfAttention.attention_func = None
encoder/SelfAttention.attention_kwargs = None
encoder/SelfAttention.combine_dims = True
encoder/SelfAttention.dropout_rate = %dropout_rate
encoder/SelfAttention.fold_scaling_into_initializer = True
encoder/SelfAttention.keep_query_heads_dims = False
encoder/SelfAttention.key_value_size = %d_kv
encoder/SelfAttention.num_heads = %num_heads
encoder/SelfAttention.num_memory_heads = 0
encoder/SelfAttention.relative_attention_num_buckets = 32
encoder/SelfAttention.relative_attention_type = 'bias_shared'
encoder/SelfAttention.shared_kv = False

# Parameters for sentinel_id:
# ==============================================================================
sentinel_id.return_value = None

# Parameters for serialize_num_microbatches:
# ==============================================================================
serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192

# Parameters for SimdMeshImpl:
# ==============================================================================
SimdMeshImpl.allreduce_in_bfloat16_max_group_size = 8

# Parameters for split_tokens:
# ==============================================================================
split_tokens.additional_feature_keys = None
split_tokens.num_parallel_calls = -1
split_tokens.passthrough_feature_keys = None

# Parameters for sublayer_call_layer:
# ==============================================================================
# None.

# Parameters for sublayer_dropout:
# ==============================================================================
sublayer_dropout.dropout_rate = %dropout_rate

# Parameters for sublayer_mask_padding:
# ==============================================================================
# None.

# Parameters for sublayer_residual:
# ==============================================================================
# None.

# Parameters for sublayer_rms_norm:
# ==============================================================================
sublayer_rms_norm.epsilon = 1e-06
sublayer_rms_norm.name = 'rms_norm'

# Parameters for tpu_estimator_model_fn:
# ==============================================================================
tpu_estimator_model_fn.hierarchical_tiling_spec = None
tpu_estimator_model_fn.init_variable_filter = ''
tpu_estimator_model_fn.model_info_file = ''
tpu_estimator_model_fn.outer_batch_size = 1
tpu_estimator_model_fn.tpu_summaries = False

# Parameters for tpu_mesh_shape:
# ==============================================================================
tpu_mesh_shape.ensemble_parallelism = None
tpu_mesh_shape.model_parallelism = 1
tpu_mesh_shape.tpu_topology = 'v3-8'

# Parameters for unit_scaling_convention:
# ==============================================================================
unit_scaling_convention.value = False

# Parameters for decoder/Unitransformer:
# ==============================================================================
decoder/Unitransformer.d_model = %d_model
decoder/Unitransformer.ensemble = None
decoder/Unitransformer.input_full_attention = False
decoder/Unitransformer.label_smoothing = 0.0
decoder/Unitransformer.loss_denominator = None
decoder/Unitransformer.loss_fn = None
decoder/Unitransformer.loss_on_targets_only = False
decoder/Unitransformer.max_length = 512
decoder/Unitransformer.positional_embedding = False
decoder/Unitransformer.shared_embedding_and_softmax_weights = True
decoder/Unitransformer.sinusoid_positional_embedding = False
decoder/Unitransformer.token_dropout_rate = 0.0
decoder/Unitransformer.vocab_divisor = 128
decoder/Unitransformer.z_loss = 0.0001

# Parameters for encoder/Unitransformer:
# ==============================================================================
encoder/Unitransformer.d_model = %d_model
encoder/Unitransformer.ensemble = None
encoder/Unitransformer.input_full_attention = False
encoder/Unitransformer.label_smoothing = 0.0
encoder/Unitransformer.loss_denominator = None
encoder/Unitransformer.loss_fn = None
encoder/Unitransformer.loss_on_targets_only = False
encoder/Unitransformer.max_length = 512
encoder/Unitransformer.positional_embedding = False
encoder/Unitransformer.shared_embedding_and_softmax_weights = True
encoder/Unitransformer.sinusoid_positional_embedding = False
encoder/Unitransformer.token_dropout_rate = 0.0
encoder/Unitransformer.vocab_divisor = 128
encoder/Unitransformer.z_loss = 0.0001

# Parameters for VarianceScalingInitializer:
# ==============================================================================
VarianceScalingInitializer.distribution = 'normal'
VarianceScalingInitializer.mode = 'fan_in'
VarianceScalingInitializer.scale = 1.0

# Parameters for VocabEmbedding:
# ==============================================================================
VocabEmbedding.scale_variable_like_classifier_weights = False