patrickvonplaten committed on
Commit
3c72028
1 Parent(s): 10c17ad

Add config

Browse files
Files changed (2) hide show
  1. config.json +26 -0
  2. operative_config.gin +370 -0
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "t5-base",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 4096,
7
+ "d_kv": 64,
8
+ "d_model": 1024,
9
+ "decoder_start_token_id": 0,
10
+ "dropout_rate": 0.1,
11
+ "eos_token_id": 1,
12
+ "feed_forward_proj": "relu",
13
+ "initializer_factor": 1.0,
14
+ "is_encoder_decoder": true,
15
+ "layer_norm_epsilon": 1e-06,
16
+ "model_type": "t5",
17
+ "n_positions": 512,
18
+ "num_decoder_layers": 24,
19
+ "num_heads": 16,
20
+ "num_layers": 24,
21
+ "pad_token_id": 0,
22
+ "relative_attention_num_buckets": 32,
23
+ "transformers_version": "4.17.0.dev0",
24
+ "use_cache": true,
25
+ "vocab_size": 32128
26
+ }
operative_config.gin ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer
6
+ import mesh_tensorflow.transformer.transformer_layers
7
+ import mesh_tensorflow.transformer.utils
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 4096
13
+ d_kv = 64
14
+ d_model = 1024
15
+ dropout_rate = 0.0
16
+ inputs_length = 512
17
+ mean_noise_span_length = 3.0
18
+ MIXTURE_NAME = 'c4_v220_unsupervised'
19
+ noise_density = 0.15
20
+ num_heads = 16
21
+ num_layers = 24
22
+
23
+ # Parameters for adafactor_decay_rate_pow:
24
+ # ==============================================================================
25
+ adafactor_decay_rate_pow.offset = 0
26
+
27
+ # Parameters for AdafactorOptimizer:
28
+ # ==============================================================================
29
+ AdafactorOptimizer.beta1 = 0.0
30
+ AdafactorOptimizer.clipping_threshold = 1.0
31
+ AdafactorOptimizer.decay_rate = None
32
+ AdafactorOptimizer.epsilon1 = 1e-30
33
+ AdafactorOptimizer.epsilon2 = 0.001
34
+ AdafactorOptimizer.factored = True
35
+ AdafactorOptimizer.min_dim_size_to_factor = 128
36
+ AdafactorOptimizer.multiply_by_parameter_scale = True
37
+
38
+ # Parameters for Bitransformer:
39
+ # ==============================================================================
40
+ Bitransformer.shared_embedding = True
41
+
42
+ # Parameters for denoise:
43
+ # ==============================================================================
44
+ denoise.inputs_fn = @preprocessors.noise_span_to_unique_sentinel
45
+ denoise.noise_density = %noise_density
46
+ denoise.noise_mask_fn = @preprocessors.random_spans_noise_mask
47
+ denoise.targets_fn = @preprocessors.nonnoise_span_to_unique_sentinel
48
+
49
+ # Parameters for decoder/DenseReluDense:
50
+ # ==============================================================================
51
+ decoder/DenseReluDense.activation = 'relu'
52
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
53
+ decoder/DenseReluDense.hidden_size = %d_ff
54
+ decoder/DenseReluDense.use_bias = False
55
+
56
+ # Parameters for encoder/DenseReluDense:
57
+ # ==============================================================================
58
+ encoder/DenseReluDense.activation = 'relu'
59
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
60
+ encoder/DenseReluDense.hidden_size = %d_ff
61
+ encoder/DenseReluDense.use_bias = False
62
+
63
+ # Parameters for enc_dec_attention:
64
+ # ==============================================================================
65
+ # None.
66
+
67
+ # Parameters for enc_dec_attention_bias:
68
+ # ==============================================================================
69
+ # None.
70
+
71
+ # Parameters for decoder/EncDecAttention:
72
+ # ==============================================================================
73
+ decoder/EncDecAttention.relative_attention_type = None
74
+
75
+ # Parameters for get_variable_dtype:
76
+ # ==============================================================================
77
+ get_variable_dtype.activation_dtype = 'bfloat16'
78
+
79
+ # Parameters for get_vocab_embedding_cls:
80
+ # ==============================================================================
81
+ # None.
82
+
83
+ # Parameters for get_vocabulary:
84
+ # ==============================================================================
85
+ get_vocabulary.mixture_or_task_name = %MIXTURE_NAME
86
+
87
+ # Parameters for decoder/LayerStack:
88
+ # ==============================================================================
89
+ decoder/LayerStack.dropout_rate = None
90
+ decoder/LayerStack.norm_epsilon = None
91
+ decoder/LayerStack.recompute_grads = False
92
+ decoder/LayerStack.sublayers_final = \
93
+ [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
94
+ decoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
95
+ decoder/LayerStack.sublayers_per_layer = \
96
+ [@transformer.sublayer_rms_norm,
97
+ @transformer.sublayer_call_layer,
98
+ @transformer.sublayer_dropout,
99
+ @transformer.sublayer_residual]
100
+
101
+ # Parameters for encoder/LayerStack:
102
+ # ==============================================================================
103
+ encoder/LayerStack.dropout_rate = None
104
+ encoder/LayerStack.norm_epsilon = None
105
+ encoder/LayerStack.recompute_grads = False
106
+ encoder/LayerStack.sublayers_final = \
107
+ [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
108
+ encoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
109
+ encoder/LayerStack.sublayers_per_layer = \
110
+ [@transformer.sublayer_rms_norm,
111
+ @transformer.sublayer_call_layer,
112
+ @transformer.sublayer_dropout,
113
+ @transformer.sublayer_residual]
114
+
115
+ # Parameters for learning_rate_schedule_noam:
116
+ # ==============================================================================
117
+ learning_rate_schedule_noam.linear_decay_fraction = 0.0
118
+ learning_rate_schedule_noam.multiplier = 1.0
119
+ learning_rate_schedule_noam.offset = 0
120
+ learning_rate_schedule_noam.warmup_steps = 10000
121
+
122
+ # Parameters for make_bitransformer:
123
+ # ==============================================================================
124
+ make_bitransformer.decoder_name = 'decoder'
125
+ make_bitransformer.encoder_name = 'encoder'
126
+
127
+ # Parameters for decoder/make_layer_stack:
128
+ # ==============================================================================
129
+ decoder/make_layer_stack.block_scope = True
130
+ decoder/make_layer_stack.layers = \
131
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
132
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
133
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
134
+ decoder/make_layer_stack.num_layers = %num_layers
135
+
136
+ # Parameters for encoder/make_layer_stack:
137
+ # ==============================================================================
138
+ encoder/make_layer_stack.block_scope = True
139
+ encoder/make_layer_stack.layers = \
140
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
141
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
142
+ encoder/make_layer_stack.num_layers = %num_layers
143
+
144
+ # Parameters for mesh_train_dataset_fn:
145
+ # ==============================================================================
146
+ mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
147
+ mesh_train_dataset_fn.pack = True
148
+ mesh_train_dataset_fn.seed = None
149
+ mesh_train_dataset_fn.use_cached = 1
150
+
151
+ # Parameters for noise_span_to_unique_sentinel:
152
+ # ==============================================================================
153
+ # None.
154
+
155
+ # Parameters for nonnoise_span_to_unique_sentinel:
156
+ # ==============================================================================
157
+ # None.
158
+
159
+ # Parameters for pack_dataset:
160
+ # ==============================================================================
161
+ pack_dataset.use_custom_ops = True
162
+
163
+ # Parameters for pack_or_pad:
164
+ # ==============================================================================
165
+ # None.
166
+
167
+ # Parameters for random_spans_helper:
168
+ # ==============================================================================
169
+ random_spans_helper.extra_tokens_per_span_inputs = 1
170
+ random_spans_helper.extra_tokens_per_span_targets = 1
171
+ random_spans_helper.inputs_length = %inputs_length
172
+ random_spans_helper.mean_noise_span_length = %mean_noise_span_length
173
+ random_spans_helper.noise_density = %noise_density
174
+ random_spans_helper.verbose = False
175
+
176
+ # Parameters for random_spans_noise_mask:
177
+ # ==============================================================================
178
+ random_spans_noise_mask.mean_noise_span_length = %mean_noise_span_length
179
+
180
+ # Parameters for random_spans_tokens_length:
181
+ # ==============================================================================
182
+ # None.
183
+
184
+ # Parameters for reduce_concat_tokens:
185
+ # ==============================================================================
186
+ reduce_concat_tokens.batch_size = 128
187
+ reduce_concat_tokens.feature_key = 'targets'
188
+
189
+ # Parameters for rewrite_stack_variables:
190
+ # ==============================================================================
191
+ rewrite_stack_variables.max_combined_variable_size = 536870912
192
+
193
+ # Parameters for run:
194
+ # ==============================================================================
195
+ run.autostack = True
196
+ run.batch_size = ('tokens_per_batch', 65536)
197
+ run.dataset_split = 'train'
198
+ run.ensemble_inputs = None
199
+ run.eval_checkpoint_step = None
200
+ run.eval_dataset_fn = None
201
+ run.eval_summary_dir = None
202
+ run.export_checkpoint_step = None
203
+ run.export_path = ''
204
+ run.init_checkpoint = None
205
+ run.iterations_per_loop = 100
206
+ run.keep_checkpoint_max = None
207
+ run.layout_rules = \
208
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
209
+ run.learning_rate_schedule = @learning_rate_schedules.learning_rate_schedule_noam
210
+ run.mesh_devices = None
211
+ run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
212
+ run.mode = 'train'
213
+ run.model_type = 'bitransformer'
214
+ run.optimizer = @optimize.AdafactorOptimizer
215
+ run.output_eval_examples = True
216
+ run.perplexity_eval_steps = 100
217
+ run.predict_fn = None
218
+ run.save_checkpoints_steps = 10000
219
+ run.seen_data_init_step = 0
220
+ run.sequence_length = {'inputs': 512, 'targets': 128}
221
+ run.skip_seen_data = False
222
+ run.total_run_steps = None
223
+ run.train_dataset_fn = @t5.models.mesh_transformer.mesh_train_dataset_fn
224
+ run.train_steps = 524288
225
+ run.variable_filter = None
226
+
227
+ # Parameters for select_random_chunk:
228
+ # ==============================================================================
229
+ select_random_chunk.additional_feature_keys = None
230
+ select_random_chunk.additional_passthrough_keys = None
231
+ select_random_chunk.feature_key = 'targets'
232
+ select_random_chunk.max_length = 65536
233
+ select_random_chunk.uniform_random_start = False
234
+
235
+ # Parameters for decoder/SelfAttention:
236
+ # ==============================================================================
237
+ decoder/SelfAttention.attention_func = None
238
+ decoder/SelfAttention.attention_kwargs = None
239
+ decoder/SelfAttention.combine_dims = True
240
+ decoder/SelfAttention.dropout_rate = %dropout_rate
241
+ decoder/SelfAttention.fold_scaling_into_initializer = True
242
+ decoder/SelfAttention.keep_query_heads_dims = False
243
+ decoder/SelfAttention.key_value_size = %d_kv
244
+ decoder/SelfAttention.num_heads = %num_heads
245
+ decoder/SelfAttention.num_memory_heads = 0
246
+ decoder/SelfAttention.relative_attention_num_buckets = 32
247
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
248
+ decoder/SelfAttention.shared_kv = False
249
+
250
+ # Parameters for encoder/SelfAttention:
251
+ # ==============================================================================
252
+ encoder/SelfAttention.attention_func = None
253
+ encoder/SelfAttention.attention_kwargs = None
254
+ encoder/SelfAttention.combine_dims = True
255
+ encoder/SelfAttention.dropout_rate = %dropout_rate
256
+ encoder/SelfAttention.fold_scaling_into_initializer = True
257
+ encoder/SelfAttention.keep_query_heads_dims = False
258
+ encoder/SelfAttention.key_value_size = %d_kv
259
+ encoder/SelfAttention.num_heads = %num_heads
260
+ encoder/SelfAttention.num_memory_heads = 0
261
+ encoder/SelfAttention.relative_attention_num_buckets = 32
262
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
263
+ encoder/SelfAttention.shared_kv = False
264
+
265
+ # Parameters for serialize_num_microbatches:
266
+ # ==============================================================================
267
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
268
+
269
+ # Parameters for SimdMeshImpl:
270
+ # ==============================================================================
271
+ SimdMeshImpl.allreduce_in_bfloat16_max_group_size = 8
272
+
273
+ # Parameters for split_tokens:
274
+ # ==============================================================================
275
+ split_tokens.additional_feature_keys = None
276
+ split_tokens.feature_key = 'targets'
277
+ split_tokens.max_tokens_per_segment = @preprocessors.random_spans_tokens_length()
278
+ split_tokens.min_tokens_per_segment = None
279
+ split_tokens.passthrough_feature_keys = None
280
+
281
+ # Parameters for sublayer_call_layer:
282
+ # ==============================================================================
283
+ # None.
284
+
285
+ # Parameters for sublayer_dropout:
286
+ # ==============================================================================
287
+ sublayer_dropout.dropout_rate = %dropout_rate
288
+
289
+ # Parameters for sublayer_mask_padding:
290
+ # ==============================================================================
291
+ # None.
292
+
293
+ # Parameters for sublayer_residual:
294
+ # ==============================================================================
295
+ # None.
296
+
297
+ # Parameters for sublayer_rms_norm:
298
+ # ==============================================================================
299
+ sublayer_rms_norm.epsilon = 1e-06
300
+ sublayer_rms_norm.name = 'rms_norm'
301
+
302
+ # Parameters for tpu_estimator_model_fn:
303
+ # ==============================================================================
304
+ tpu_estimator_model_fn.hierarchical_tiling_spec = None
305
+ tpu_estimator_model_fn.init_variable_filter = ''
306
+ tpu_estimator_model_fn.model_info_file = ''
307
+ tpu_estimator_model_fn.outer_batch_size = 1
308
+ tpu_estimator_model_fn.tpu_summaries = False
309
+
310
+ # Parameters for tpu_mesh_shape:
311
+ # ==============================================================================
312
+ tpu_mesh_shape.ensemble_parallelism = None
313
+ tpu_mesh_shape.model_parallelism = 2
314
+ tpu_mesh_shape.tpu_topology = '4x4'
315
+
316
+ # Parameters for unit_scaling_convention:
317
+ # ==============================================================================
318
+ unit_scaling_convention.value = False
319
+
320
+ # Parameters for decoder/Unitransformer:
321
+ # ==============================================================================
322
+ decoder/Unitransformer.d_model = %d_model
323
+ decoder/Unitransformer.ensemble = None
324
+ decoder/Unitransformer.input_full_attention = False
325
+ decoder/Unitransformer.label_smoothing = 0.0
326
+ decoder/Unitransformer.loss_denominator = None
327
+ decoder/Unitransformer.loss_fn = None
328
+ decoder/Unitransformer.loss_on_targets_only = False
329
+ decoder/Unitransformer.max_length = 512
330
+ decoder/Unitransformer.positional_embedding = False
331
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
332
+ decoder/Unitransformer.sinusoid_positional_embedding = False
333
+ decoder/Unitransformer.token_dropout_rate = 0.0
334
+ decoder/Unitransformer.vocab_divisor = 128
335
+ decoder/Unitransformer.z_loss = 0.0001
336
+
337
+ # Parameters for encoder/Unitransformer:
338
+ # ==============================================================================
339
+ encoder/Unitransformer.d_model = %d_model
340
+ encoder/Unitransformer.ensemble = None
341
+ encoder/Unitransformer.input_full_attention = False
342
+ encoder/Unitransformer.label_smoothing = 0.0
343
+ encoder/Unitransformer.loss_denominator = None
344
+ encoder/Unitransformer.loss_fn = None
345
+ encoder/Unitransformer.loss_on_targets_only = False
346
+ encoder/Unitransformer.max_length = 512
347
+ encoder/Unitransformer.positional_embedding = False
348
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
349
+ encoder/Unitransformer.sinusoid_positional_embedding = False
350
+ encoder/Unitransformer.token_dropout_rate = 0.0
351
+ encoder/Unitransformer.vocab_divisor = 128
352
+ encoder/Unitransformer.z_loss = 0.0001
353
+
354
+ # Parameters for unsupervised:
355
+ # ==============================================================================
356
+ unsupervised.preprocessors = \
357
+ [@preprocessors.select_random_chunk,
358
+ @preprocessors.reduce_concat_tokens,
359
+ @preprocessors.split_tokens,
360
+ @preprocessors.denoise]
361
+
362
+ # Parameters for VarianceScalingInitializer:
363
+ # ==============================================================================
364
+ VarianceScalingInitializer.distribution = 'normal'
365
+ VarianceScalingInitializer.mode = 'fan_in'
366
+ VarianceScalingInitializer.scale = 1.0
367
+
368
+ # Parameters for VocabEmbedding:
369
+ # ==============================================================================
370
+ VocabEmbedding.scale_variable_like_classifier_weights = False