patrickvonplaten committed on
Commit
d7b60d5
1 Parent(s): 81e4881

Upload model

Browse files
Files changed (1) hide show
  1. operative_config.gin +372 -0
operative_config.gin ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer
6
+ import mesh_tensorflow.transformer.transformer_layers
7
+ import mesh_tensorflow.transformer.utils
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 16384
13
+ d_kv = 128
14
+ d_model = 1024
15
+ dropout_rate = 0.0
16
+ inputs_length = 512
17
+ mean_noise_span_length = 3.0
18
+ MIXTURE_NAME = 'c4_v220_unsupervised'
19
+ noise_density = 0.15
20
+ num_heads = 32
21
+ num_layers = 24
22
+
23
+ # Parameters for adafactor_decay_rate_pow:
24
+ # ==============================================================================
25
+ adafactor_decay_rate_pow.offset = 0
26
+
27
+ # Parameters for AdafactorOptimizer:
28
+ # ==============================================================================
29
+ AdafactorOptimizer.beta1 = 0.0
30
+ AdafactorOptimizer.clipping_threshold = 1.0
31
+ AdafactorOptimizer.decay_rate = None
32
+ AdafactorOptimizer.epsilon1 = 1e-30
33
+ AdafactorOptimizer.epsilon2 = 0.001
34
+ AdafactorOptimizer.factored = True
35
+ AdafactorOptimizer.min_dim_size_to_factor = 128
36
+ AdafactorOptimizer.multiply_by_parameter_scale = True
37
+
38
+ # Parameters for Bitransformer:
39
+ # ==============================================================================
40
+ Bitransformer.shared_embedding = True
41
+
42
+ # Parameters for denoise:
43
+ # ==============================================================================
44
+ denoise.inputs_fn = @preprocessors.noise_span_to_unique_sentinel
45
+ denoise.noise_density = %noise_density
46
+ denoise.noise_mask_fn = @preprocessors.random_spans_noise_mask
47
+ denoise.targets_fn = @preprocessors.nonnoise_span_to_unique_sentinel
48
+
49
+ # Parameters for decoder/DenseReluDense:
50
+ # ==============================================================================
51
+ decoder/DenseReluDense.activation = 'relu'
52
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
53
+ decoder/DenseReluDense.hidden_size = %d_ff
54
+ decoder/DenseReluDense.use_bias = False
55
+
56
+ # Parameters for encoder/DenseReluDense:
57
+ # ==============================================================================
58
+ encoder/DenseReluDense.activation = 'relu'
59
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
60
+ encoder/DenseReluDense.hidden_size = %d_ff
61
+ encoder/DenseReluDense.use_bias = False
62
+
63
+ # Parameters for enc_dec_attention:
64
+ # ==============================================================================
65
+ # None.
66
+
67
+ # Parameters for enc_dec_attention_bias:
68
+ # ==============================================================================
69
+ # None.
70
+
71
+ # Parameters for decoder/EncDecAttention:
72
+ # ==============================================================================
73
+ decoder/EncDecAttention.relative_attention_type = None
74
+
75
+ # Parameters for get_variable_dtype:
76
+ # ==============================================================================
77
+ get_variable_dtype.activation_dtype = 'bfloat16'
78
+
79
+ # Parameters for get_vocab_embedding_cls:
80
+ # ==============================================================================
81
+ # None.
82
+
83
+ # Parameters for get_vocabulary:
84
+ # ==============================================================================
85
+ get_vocabulary.mixture_or_task_name = %MIXTURE_NAME
86
+
87
+ # Parameters for decoder/LayerStack:
88
+ # ==============================================================================
89
+ decoder/LayerStack.dropout_rate = None
90
+ decoder/LayerStack.norm_epsilon = None
91
+ decoder/LayerStack.recompute_grads = False
92
+ decoder/LayerStack.sublayers_final = \
93
+ [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
94
+ decoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
95
+ decoder/LayerStack.sublayers_per_layer = \
96
+ [@transformer.sublayer_rms_norm,
97
+ @transformer.sublayer_call_layer,
98
+ @transformer.sublayer_dropout,
99
+ @transformer.sublayer_residual]
100
+
101
+ # Parameters for encoder/LayerStack:
102
+ # ==============================================================================
103
+ encoder/LayerStack.dropout_rate = None
104
+ encoder/LayerStack.norm_epsilon = None
105
+ encoder/LayerStack.recompute_grads = False
106
+ encoder/LayerStack.sublayers_final = \
107
+ [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
108
+ encoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
109
+ encoder/LayerStack.sublayers_per_layer = \
110
+ [@transformer.sublayer_rms_norm,
111
+ @transformer.sublayer_call_layer,
112
+ @transformer.sublayer_dropout,
113
+ @transformer.sublayer_residual]
114
+
115
+ # Parameters for learning_rate_schedule_noam:
116
+ # ==============================================================================
117
+ learning_rate_schedule_noam.linear_decay_fraction = 0.0
118
+ learning_rate_schedule_noam.multiplier = 1.0
119
+ learning_rate_schedule_noam.offset = 0
120
+ learning_rate_schedule_noam.warmup_steps = 10000
121
+
122
+ # Parameters for make_bitransformer:
123
+ # ==============================================================================
124
+ make_bitransformer.decoder_name = 'decoder'
125
+ make_bitransformer.encoder_name = 'encoder'
126
+
127
+ # Parameters for decoder/make_layer_stack:
128
+ # ==============================================================================
129
+ decoder/make_layer_stack.block_scope = True
130
+ decoder/make_layer_stack.layers = \
131
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
132
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
133
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
134
+ decoder/make_layer_stack.num_layers = %num_layers
135
+
136
+ # Parameters for encoder/make_layer_stack:
137
+ # ==============================================================================
138
+ encoder/make_layer_stack.block_scope = True
139
+ encoder/make_layer_stack.layers = \
140
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
141
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
142
+ encoder/make_layer_stack.num_layers = %num_layers
143
+
144
+ # Parameters for mesh_train_dataset_fn:
145
+ # ==============================================================================
146
+ mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
147
+ mesh_train_dataset_fn.pack = True
148
+ mesh_train_dataset_fn.seed = None
149
+ mesh_train_dataset_fn.shuffle = True
150
+ mesh_train_dataset_fn.use_cached = 1
151
+
152
+ # Parameters for noise_span_to_unique_sentinel:
153
+ # ==============================================================================
154
+ # None.
155
+
156
+ # Parameters for nonnoise_span_to_unique_sentinel:
157
+ # ==============================================================================
158
+ # None.
159
+
160
+ # Parameters for pack_dataset:
161
+ # ==============================================================================
162
+ pack_dataset.use_custom_ops = True
163
+
164
+ # Parameters for pack_or_pad:
165
+ # ==============================================================================
166
+ # None.
167
+
168
+ # Parameters for random_spans_helper:
169
+ # ==============================================================================
170
+ random_spans_helper.extra_tokens_per_span_inputs = 1
171
+ random_spans_helper.extra_tokens_per_span_targets = 1
172
+ random_spans_helper.inputs_length = %inputs_length
173
+ random_spans_helper.mean_noise_span_length = %mean_noise_span_length
174
+ random_spans_helper.noise_density = %noise_density
175
+ random_spans_helper.verbose = False
176
+
177
+ # Parameters for random_spans_noise_mask:
178
+ # ==============================================================================
179
+ random_spans_noise_mask.mean_noise_span_length = %mean_noise_span_length
180
+
181
+ # Parameters for random_spans_tokens_length:
182
+ # ==============================================================================
183
+ # None.
184
+
185
+ # Parameters for reduce_concat_tokens:
186
+ # ==============================================================================
187
+ reduce_concat_tokens.batch_size = 128
188
+ reduce_concat_tokens.feature_key = 'targets'
189
+
190
+ # Parameters for rewrite_stack_variables:
191
+ # ==============================================================================
192
+ rewrite_stack_variables.max_combined_variable_size = 536870912
193
+
194
+ # Parameters for run:
195
+ # ==============================================================================
196
+ run.autostack = True
197
+ run.batch_size = ('tokens_per_batch', 65536)
198
+ run.checkpoint_input_pipeline = False
199
+ run.dataset_split = 'train'
200
+ run.ensemble_inputs = None
201
+ run.eval_checkpoint_step = None
202
+ run.eval_dataset_fn = None
203
+ run.eval_summary_dir = None
204
+ run.export_checkpoint_step = None
205
+ run.export_path = ''
206
+ run.init_checkpoint = None
207
+ run.iterations_per_loop = 100
208
+ run.keep_checkpoint_max = None
209
+ run.layout_rules = \
210
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
211
+ run.learning_rate_schedule = @learning_rate_schedules.learning_rate_schedule_noam
212
+ run.mesh_devices = None
213
+ run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
214
+ run.mode = 'train'
215
+ run.model_type = 'bitransformer'
216
+ run.optimizer = @optimize.AdafactorOptimizer
217
+ run.output_eval_examples = True
218
+ run.perplexity_eval_steps = 100
219
+ run.predict_fn = None
220
+ run.save_checkpoints_steps = 5000
221
+ run.seen_data_init_step = 0
222
+ run.sequence_length = {'inputs': 512, 'targets': 128}
223
+ run.skip_seen_data = False
224
+ run.total_run_steps = None
225
+ run.train_dataset_fn = @t5.models.mesh_transformer.mesh_train_dataset_fn
226
+ run.train_steps = 524288
227
+ run.variable_filter = None
228
+
229
+ # Parameters for select_random_chunk:
230
+ # ==============================================================================
231
+ select_random_chunk.additional_feature_keys = None
232
+ select_random_chunk.additional_passthrough_keys = None
233
+ select_random_chunk.feature_key = 'targets'
234
+ select_random_chunk.max_length = 65536
235
+ select_random_chunk.uniform_random_start = False
236
+
237
+ # Parameters for decoder/SelfAttention:
238
+ # ==============================================================================
239
+ decoder/SelfAttention.attention_func = None
240
+ decoder/SelfAttention.attention_kwargs = None
241
+ decoder/SelfAttention.combine_dims = True
242
+ decoder/SelfAttention.dropout_rate = %dropout_rate
243
+ decoder/SelfAttention.fold_scaling_into_initializer = True
244
+ decoder/SelfAttention.keep_query_heads_dims = False
245
+ decoder/SelfAttention.key_value_size = %d_kv
246
+ decoder/SelfAttention.num_heads = %num_heads
247
+ decoder/SelfAttention.num_memory_heads = 0
248
+ decoder/SelfAttention.relative_attention_num_buckets = 32
249
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
250
+ decoder/SelfAttention.shared_kv = True
251
+
252
+ # Parameters for encoder/SelfAttention:
253
+ # ==============================================================================
254
+ encoder/SelfAttention.attention_func = None
255
+ encoder/SelfAttention.attention_kwargs = None
256
+ encoder/SelfAttention.combine_dims = True
257
+ encoder/SelfAttention.dropout_rate = %dropout_rate
258
+ encoder/SelfAttention.fold_scaling_into_initializer = True
259
+ encoder/SelfAttention.keep_query_heads_dims = False
260
+ encoder/SelfAttention.key_value_size = %d_kv
261
+ encoder/SelfAttention.num_heads = %num_heads
262
+ encoder/SelfAttention.num_memory_heads = 0
263
+ encoder/SelfAttention.relative_attention_num_buckets = 32
264
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
265
+ encoder/SelfAttention.shared_kv = True
266
+
267
+ # Parameters for serialize_num_microbatches:
268
+ # ==============================================================================
269
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
270
+
271
+ # Parameters for SimdMeshImpl:
272
+ # ==============================================================================
273
+ SimdMeshImpl.allreduce_in_bfloat16_max_group_size = 8
274
+
275
+ # Parameters for split_tokens:
276
+ # ==============================================================================
277
+ split_tokens.additional_feature_keys = None
278
+ split_tokens.feature_key = 'targets'
279
+ split_tokens.max_tokens_per_segment = @preprocessors.random_spans_tokens_length()
280
+ split_tokens.min_tokens_per_segment = None
281
+ split_tokens.passthrough_feature_keys = None
282
+
283
+ # Parameters for sublayer_call_layer:
284
+ # ==============================================================================
285
+ # None.
286
+
287
+ # Parameters for sublayer_dropout:
288
+ # ==============================================================================
289
+ sublayer_dropout.dropout_rate = %dropout_rate
290
+
291
+ # Parameters for sublayer_mask_padding:
292
+ # ==============================================================================
293
+ # None.
294
+
295
+ # Parameters for sublayer_residual:
296
+ # ==============================================================================
297
+ # None.
298
+
299
+ # Parameters for sublayer_rms_norm:
300
+ # ==============================================================================
301
+ sublayer_rms_norm.epsilon = 1e-06
302
+ sublayer_rms_norm.name = 'rms_norm'
303
+
304
+ # Parameters for tpu_estimator_model_fn:
305
+ # ==============================================================================
306
+ tpu_estimator_model_fn.hierarchical_tiling_spec = None
307
+ tpu_estimator_model_fn.init_variable_filter = ''
308
+ tpu_estimator_model_fn.model_info_file = ''
309
+ tpu_estimator_model_fn.outer_batch_size = 1
310
+ tpu_estimator_model_fn.tpu_summaries = False
311
+
312
+ # Parameters for tpu_mesh_shape:
313
+ # ==============================================================================
314
+ tpu_mesh_shape.ensemble_parallelism = None
315
+ tpu_mesh_shape.model_parallelism = 8
316
+ tpu_mesh_shape.tpu_topology = '4x4'
317
+
318
+ # Parameters for unit_scaling_convention:
319
+ # ==============================================================================
320
+ unit_scaling_convention.value = False
321
+
322
+ # Parameters for decoder/Unitransformer:
323
+ # ==============================================================================
324
+ decoder/Unitransformer.d_model = %d_model
325
+ decoder/Unitransformer.ensemble = None
326
+ decoder/Unitransformer.input_full_attention = False
327
+ decoder/Unitransformer.label_smoothing = 0.0
328
+ decoder/Unitransformer.loss_denominator = None
329
+ decoder/Unitransformer.loss_fn = None
330
+ decoder/Unitransformer.loss_on_targets_only = False
331
+ decoder/Unitransformer.max_length = 512
332
+ decoder/Unitransformer.positional_embedding = False
333
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
334
+ decoder/Unitransformer.sinusoid_positional_embedding = False
335
+ decoder/Unitransformer.token_dropout_rate = 0.0
336
+ decoder/Unitransformer.vocab_divisor = 128
337
+ decoder/Unitransformer.z_loss = 0.0001
338
+
339
+ # Parameters for encoder/Unitransformer:
340
+ # ==============================================================================
341
+ encoder/Unitransformer.d_model = %d_model
342
+ encoder/Unitransformer.ensemble = None
343
+ encoder/Unitransformer.input_full_attention = False
344
+ encoder/Unitransformer.label_smoothing = 0.0
345
+ encoder/Unitransformer.loss_denominator = None
346
+ encoder/Unitransformer.loss_fn = None
347
+ encoder/Unitransformer.loss_on_targets_only = False
348
+ encoder/Unitransformer.max_length = 512
349
+ encoder/Unitransformer.positional_embedding = False
350
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
351
+ encoder/Unitransformer.sinusoid_positional_embedding = False
352
+ encoder/Unitransformer.token_dropout_rate = 0.0
353
+ encoder/Unitransformer.vocab_divisor = 128
354
+ encoder/Unitransformer.z_loss = 0.0001
355
+
356
+ # Parameters for unsupervised:
357
+ # ==============================================================================
358
+ unsupervised.preprocessors = \
359
+ [@preprocessors.select_random_chunk,
360
+ @preprocessors.reduce_concat_tokens,
361
+ @preprocessors.split_tokens,
362
+ @preprocessors.denoise]
363
+
364
+ # Parameters for VarianceScalingInitializer:
365
+ # ==============================================================================
366
+ VarianceScalingInitializer.distribution = 'normal'
367
+ VarianceScalingInitializer.mode = 'fan_in'
368
+ VarianceScalingInitializer.scale = 1.0
369
+
370
+ # Parameters for VocabEmbedding:
371
+ # ==============================================================================
372
+ VocabEmbedding.scale_variable_like_classifier_weights = False