patrickvonplaten committed on
Commit
276b12e
1 Parent(s): 8ff95af

Add config

Browse files
Files changed (2) hide show
  1. config.json +26 -0
  2. operative_config.gin +375 -0
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "t5-base",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 65536,
7
+ "d_kv": 64,
8
+ "d_model": 1024,
9
+ "decoder_start_token_id": 0,
10
+ "dropout_rate": 0.1,
11
+ "eos_token_id": 1,
12
+ "feed_forward_proj": "relu",
13
+ "initializer_factor": 1.0,
14
+ "is_encoder_decoder": true,
15
+ "layer_norm_epsilon": 1e-06,
16
+ "model_type": "t5",
17
+ "n_positions": 512,
18
+ "num_decoder_layers": 4,
19
+ "num_heads": 32,
20
+ "num_layers": 4,
21
+ "pad_token_id": 0,
22
+ "relative_attention_num_buckets": 32,
23
+ "transformers_version": "4.17.0.dev0",
24
+ "use_cache": true,
25
+ "vocab_size": 32128
26
+ }
operative_config.gin ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer
6
+ import mesh_tensorflow.transformer.transformer_layers
7
+ import mesh_tensorflow.transformer.utils
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 65536
13
+ d_kv = 64
14
+ d_model = 1024
15
+ dropout_rate = 0.0
16
+ inputs_length = 512
17
+ mean_noise_span_length = 3.0
18
+ MIXTURE_NAME = 'c4_v220_unsupervised'
19
+ noise_density = 0.15
20
+ num_heads = 32
21
+ num_layers = 4
22
+
23
+ # Parameters for adafactor_decay_rate_pow:
24
+ # ==============================================================================
25
+ adafactor_decay_rate_pow.offset = 0
26
+
27
+ # Parameters for AdafactorOptimizer:
28
+ # ==============================================================================
29
+ AdafactorOptimizer.beta1 = 0.0
30
+ AdafactorOptimizer.clipping_threshold = 1.0
31
+ AdafactorOptimizer.decay_rate = None
32
+ AdafactorOptimizer.epsilon1 = 1e-30
33
+ AdafactorOptimizer.epsilon2 = 0.001
34
+ AdafactorOptimizer.factored = True
35
+ AdafactorOptimizer.min_dim_size_to_factor = 128
36
+ AdafactorOptimizer.multiply_by_parameter_scale = True
37
+
38
+ # Parameters for Bitransformer:
39
+ # ==============================================================================
40
+ Bitransformer.shared_embedding = True
41
+
42
+ # Parameters for denoise:
43
+ # ==============================================================================
44
+ denoise.inputs_fn = @preprocessors.noise_span_to_unique_sentinel
45
+ denoise.noise_density = %noise_density
46
+ denoise.noise_mask_fn = @preprocessors.random_spans_noise_mask
47
+ denoise.passthrough_feature_keys = None
48
+ denoise.targets_fn = @preprocessors.nonnoise_span_to_unique_sentinel
49
+
50
+ # Parameters for decoder/DenseReluDense:
51
+ # ==============================================================================
52
+ decoder/DenseReluDense.activation = 'relu'
53
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
54
+ decoder/DenseReluDense.hidden_size = %d_ff
55
+ decoder/DenseReluDense.use_bias = False
56
+
57
+ # Parameters for encoder/DenseReluDense:
58
+ # ==============================================================================
59
+ encoder/DenseReluDense.activation = 'relu'
60
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
61
+ encoder/DenseReluDense.hidden_size = %d_ff
62
+ encoder/DenseReluDense.use_bias = False
63
+
64
+ # Parameters for enc_dec_attention:
65
+ # ==============================================================================
66
+ # None.
67
+
68
+ # Parameters for enc_dec_attention_bias:
69
+ # ==============================================================================
70
+ # None.
71
+
72
+ # Parameters for decoder/EncDecAttention:
73
+ # ==============================================================================
74
+ decoder/EncDecAttention.relative_attention_type = None
75
+
76
+ # Parameters for get_variable_dtype:
77
+ # ==============================================================================
78
+ get_variable_dtype.activation_dtype = 'bfloat16'
79
+
80
+ # Parameters for get_vocab_embedding_cls:
81
+ # ==============================================================================
82
+ # None.
83
+
84
+ # Parameters for get_vocabulary:
85
+ # ==============================================================================
86
+ get_vocabulary.mixture_or_task_name = %MIXTURE_NAME
87
+
88
+ # Parameters for decoder/LayerStack:
89
+ # ==============================================================================
90
+ decoder/LayerStack.dropout_rate = None
91
+ decoder/LayerStack.norm_epsilon = None
92
+ decoder/LayerStack.recompute_grads = False
93
+ decoder/LayerStack.sublayers_final = \
94
+ [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
95
+ decoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
96
+ decoder/LayerStack.sublayers_per_layer = \
97
+ [@transformer.sublayer_rms_norm,
98
+ @transformer.sublayer_call_layer,
99
+ @transformer.sublayer_dropout,
100
+ @transformer.sublayer_residual]
101
+
102
+ # Parameters for encoder/LayerStack:
103
+ # ==============================================================================
104
+ encoder/LayerStack.dropout_rate = None
105
+ encoder/LayerStack.norm_epsilon = None
106
+ encoder/LayerStack.recompute_grads = False
107
+ encoder/LayerStack.sublayers_final = \
108
+ [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
109
+ encoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
110
+ encoder/LayerStack.sublayers_per_layer = \
111
+ [@transformer.sublayer_rms_norm,
112
+ @transformer.sublayer_call_layer,
113
+ @transformer.sublayer_dropout,
114
+ @transformer.sublayer_residual]
115
+
116
+ # Parameters for learning_rate_schedule_noam:
117
+ # ==============================================================================
118
+ learning_rate_schedule_noam.linear_decay_fraction = 0.0
119
+ learning_rate_schedule_noam.multiplier = 1.0
120
+ learning_rate_schedule_noam.offset = 0
121
+ learning_rate_schedule_noam.warmup_steps = 10000
122
+
123
+ # Parameters for make_bitransformer:
124
+ # ==============================================================================
125
+ make_bitransformer.decoder_name = 'decoder'
126
+ make_bitransformer.encoder_name = 'encoder'
127
+
128
+ # Parameters for decoder/make_layer_stack:
129
+ # ==============================================================================
130
+ decoder/make_layer_stack.block_scope = True
131
+ decoder/make_layer_stack.layers = \
132
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
133
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
134
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
135
+ decoder/make_layer_stack.num_layers = %num_layers
136
+
137
+ # Parameters for encoder/make_layer_stack:
138
+ # ==============================================================================
139
+ encoder/make_layer_stack.block_scope = True
140
+ encoder/make_layer_stack.layers = \
141
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
142
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
143
+ encoder/make_layer_stack.num_layers = %num_layers
144
+
145
+ # Parameters for mesh_train_dataset_fn:
146
+ # ==============================================================================
147
+ mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
148
+ mesh_train_dataset_fn.pack = True
149
+ mesh_train_dataset_fn.seed = None
150
+ mesh_train_dataset_fn.shuffle = True
151
+ mesh_train_dataset_fn.use_cached = 1
152
+
153
+ # Parameters for noise_span_to_unique_sentinel:
154
+ # ==============================================================================
155
+ # None.
156
+
157
+ # Parameters for nonnoise_span_to_unique_sentinel:
158
+ # ==============================================================================
159
+ # None.
160
+
161
+ # Parameters for pack_dataset:
162
+ # ==============================================================================
163
+ pack_dataset.use_custom_ops = True
164
+
165
+ # Parameters for pack_or_pad:
166
+ # ==============================================================================
167
+ # None.
168
+
169
+ # Parameters for random_spans_helper:
170
+ # ==============================================================================
171
+ random_spans_helper.extra_tokens_per_span_inputs = 1
172
+ random_spans_helper.extra_tokens_per_span_targets = 1
173
+ random_spans_helper.inputs_length = %inputs_length
174
+ random_spans_helper.mean_noise_span_length = %mean_noise_span_length
175
+ random_spans_helper.noise_density = %noise_density
176
+ random_spans_helper.verbose = False
177
+
178
+ # Parameters for random_spans_noise_mask:
179
+ # ==============================================================================
180
+ random_spans_noise_mask.mean_noise_span_length = %mean_noise_span_length
181
+
182
+ # Parameters for random_spans_tokens_length:
183
+ # ==============================================================================
184
+ # None.
185
+
186
+ # Parameters for reduce_concat_tokens:
187
+ # ==============================================================================
188
+ reduce_concat_tokens.batch_size = 128
189
+ reduce_concat_tokens.feature_key = 'targets'
190
+
191
+ # Parameters for rewrite_stack_variables:
192
+ # ==============================================================================
193
+ rewrite_stack_variables.max_combined_variable_size = 536870912
194
+
195
+ # Parameters for run:
196
+ # ==============================================================================
197
+ run.autostack = True
198
+ run.batch_size = ('tokens_per_batch', 65536)
199
+ run.checkpoint_input_pipeline = False
200
+ run.dataset_split = 'train'
201
+ run.ensemble_inputs = None
202
+ run.eval_checkpoint_step = None
203
+ run.eval_dataset_fn = None
204
+ run.eval_summary_dir = None
205
+ run.export_checkpoint_step = None
206
+ run.export_path = ''
207
+ run.init_checkpoint = None
208
+ run.iterations_per_loop = 100
209
+ run.keep_checkpoint_max = None
210
+ run.layout_rules = \
211
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
212
+ run.learning_rate_schedule = @learning_rate_schedules.learning_rate_schedule_noam
213
+ run.mesh_devices = None
214
+ run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
215
+ run.mode = 'train'
216
+ run.model_type = 'bitransformer'
217
+ run.optimizer = @optimize.AdafactorOptimizer
218
+ run.output_eval_examples = True
219
+ run.perplexity_eval_steps = 100
220
+ run.predict_fn = None
221
+ run.save_checkpoints_steps = 5000
222
+ run.seen_data_init_step = 0
223
+ run.sequence_length = {'inputs': 512, 'targets': 128}
224
+ run.skip_seen_data = False
225
+ run.total_run_steps = None
226
+ run.train_dataset_fn = @t5.models.mesh_transformer.mesh_train_dataset_fn
227
+ run.train_steps = 524288
228
+ run.variable_filter = None
229
+
230
+ # Parameters for select_random_chunk:
231
+ # ==============================================================================
232
+ select_random_chunk.additional_feature_keys = None
233
+ select_random_chunk.feature_key = 'targets'
234
+ select_random_chunk.max_length = 65536
235
+ select_random_chunk.passthrough_feature_keys = None
236
+ select_random_chunk.uniform_random_start = False
237
+
238
+ # Parameters for decoder/SelfAttention:
239
+ # ==============================================================================
240
+ decoder/SelfAttention.attention_func = None
241
+ decoder/SelfAttention.attention_kwargs = None
242
+ decoder/SelfAttention.combine_dims = True
243
+ decoder/SelfAttention.dropout_rate = %dropout_rate
244
+ decoder/SelfAttention.fold_scaling_into_initializer = True
245
+ decoder/SelfAttention.keep_query_heads_dims = False
246
+ decoder/SelfAttention.key_value_size = %d_kv
247
+ decoder/SelfAttention.num_heads = %num_heads
248
+ decoder/SelfAttention.num_memory_heads = 0
249
+ decoder/SelfAttention.relative_attention_num_buckets = 32
250
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
251
+ decoder/SelfAttention.shared_kv = False
252
+
253
+ # Parameters for encoder/SelfAttention:
254
+ # ==============================================================================
255
+ encoder/SelfAttention.attention_func = None
256
+ encoder/SelfAttention.attention_kwargs = None
257
+ encoder/SelfAttention.combine_dims = True
258
+ encoder/SelfAttention.dropout_rate = %dropout_rate
259
+ encoder/SelfAttention.fold_scaling_into_initializer = True
260
+ encoder/SelfAttention.keep_query_heads_dims = False
261
+ encoder/SelfAttention.key_value_size = %d_kv
262
+ encoder/SelfAttention.num_heads = %num_heads
263
+ encoder/SelfAttention.num_memory_heads = 0
264
+ encoder/SelfAttention.relative_attention_num_buckets = 32
265
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
266
+ encoder/SelfAttention.shared_kv = False
267
+
268
+ # Parameters for serialize_num_microbatches:
269
+ # ==============================================================================
270
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 2048
271
+
272
+ # Parameters for SimdMeshImpl:
273
+ # ==============================================================================
274
+ SimdMeshImpl.allreduce_in_bfloat16_max_group_size = 8
275
+
276
+ # Parameters for split_tokens:
277
+ # ==============================================================================
278
+ split_tokens.additional_feature_keys = None
279
+ split_tokens.feature_key = 'targets'
280
+ split_tokens.max_tokens_per_segment = @preprocessors.random_spans_tokens_length()
281
+ split_tokens.min_tokens_per_segment = None
282
+ split_tokens.passthrough_feature_keys = None
283
+
284
+ # Parameters for sublayer_call_layer:
285
+ # ==============================================================================
286
+ # None.
287
+
288
+ # Parameters for sublayer_dropout:
289
+ # ==============================================================================
290
+ sublayer_dropout.dropout_rate = %dropout_rate
291
+
292
+ # Parameters for sublayer_mask_padding:
293
+ # ==============================================================================
294
+ # None.
295
+
296
+ # Parameters for sublayer_residual:
297
+ # ==============================================================================
298
+ # None.
299
+
300
+ # Parameters for sublayer_rms_norm:
301
+ # ==============================================================================
302
+ sublayer_rms_norm.epsilon = 1e-06
303
+ sublayer_rms_norm.name = 'rms_norm'
304
+
305
+ # Parameters for tpu_estimator_model_fn:
306
+ # ==============================================================================
307
+ tpu_estimator_model_fn.hierarchical_tiling_spec = None
308
+ tpu_estimator_model_fn.init_variable_filter = ''
309
+ tpu_estimator_model_fn.model_info_file = ''
310
+ tpu_estimator_model_fn.outer_batch_size = 1
311
+ tpu_estimator_model_fn.tpu_summaries = False
312
+
313
+ # Parameters for tpu_mesh_shape:
314
+ # ==============================================================================
315
+ tpu_mesh_shape.ensemble_parallelism = None
316
+ tpu_mesh_shape.model_parallelism = 32
317
+ tpu_mesh_shape.tpu_topology = '8x8'
318
+
319
+ # Parameters for unit_scaling_convention:
320
+ # ==============================================================================
321
+ unit_scaling_convention.value = False
322
+
323
+ # Parameters for decoder/Unitransformer:
324
+ # ==============================================================================
325
+ decoder/Unitransformer.d_model = %d_model
326
+ decoder/Unitransformer.ensemble = None
327
+ decoder/Unitransformer.input_full_attention = False
328
+ decoder/Unitransformer.label_smoothing = 0.0
329
+ decoder/Unitransformer.loss_denominator = None
330
+ decoder/Unitransformer.loss_fn = None
331
+ decoder/Unitransformer.loss_on_targets_only = False
332
+ decoder/Unitransformer.max_length = 512
333
+ decoder/Unitransformer.positional_embedding = False
334
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
335
+ decoder/Unitransformer.sinusoid_positional_embedding = False
336
+ decoder/Unitransformer.token_dropout_rate = 0.0
337
+ decoder/Unitransformer.vocab_divisor = 128
338
+ decoder/Unitransformer.z_loss = 0.0001
339
+
340
+ # Parameters for encoder/Unitransformer:
341
+ # ==============================================================================
342
+ encoder/Unitransformer.d_model = %d_model
343
+ encoder/Unitransformer.ensemble = None
344
+ encoder/Unitransformer.input_full_attention = False
345
+ encoder/Unitransformer.label_smoothing = 0.0
346
+ encoder/Unitransformer.loss_denominator = None
347
+ encoder/Unitransformer.loss_fn = None
348
+ encoder/Unitransformer.loss_on_targets_only = False
349
+ encoder/Unitransformer.max_length = 512
350
+ encoder/Unitransformer.positional_embedding = False
351
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
352
+ encoder/Unitransformer.sinusoid_positional_embedding = False
353
+ encoder/Unitransformer.token_dropout_rate = 0.0
354
+ encoder/Unitransformer.vocab_divisor = 128
355
+ encoder/Unitransformer.z_loss = 0.0001
356
+
357
+ # Parameters for unsupervised:
358
+ # ==============================================================================
359
+ unsupervised.preprocessors = \
360
+ [@preprocessors.select_random_chunk,
361
+ @preprocessors.reduce_concat_tokens,
362
+ @preprocessors.split_tokens,
363
+ @preprocessors.denoise]
364
+
365
+ # Parameters for VarianceScalingInitializer:
366
+ # ==============================================================================
367
+ VarianceScalingInitializer.distribution = 'normal'
368
+ VarianceScalingInitializer.mode = 'fan_in'
369
+ VarianceScalingInitializer.scale = 1.0
370
+
371
+ # Parameters for VocabEmbedding:
372
+ # ==============================================================================
373
+ VocabEmbedding.efficient_embeddings = False
374
+ VocabEmbedding.efficient_factor = 4
375
+ VocabEmbedding.scale_variable_like_classifier_weights = False