patrickvonplaten committed on
Commit
8f674ab
1 parent: 4b77de2

Add config

Browse files
Files changed (2) hide show
  1. config.json +26 -0
  2. operative_config.gin +371 -0
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "t5-base",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 2048,
7
+ "d_kv": 64,
8
+ "d_model": 512,
9
+ "decoder_start_token_id": 0,
10
+ "dropout_rate": 0.1,
11
+ "eos_token_id": 1,
12
+ "feed_forward_proj": "relu",
13
+ "initializer_factor": 1.0,
14
+ "is_encoder_decoder": true,
15
+ "layer_norm_epsilon": 1e-06,
16
+ "model_type": "t5",
17
+ "n_positions": 512,
18
+ "num_decoder_layers": 2,
19
+ "num_heads": 8,
20
+ "num_layers": 8,
21
+ "pad_token_id": 0,
22
+ "relative_attention_num_buckets": 32,
23
+ "transformers_version": "4.17.0.dev0",
24
+ "use_cache": true,
25
+ "vocab_size": 32128
26
+ }
operative_config.gin ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer
6
+ import mesh_tensorflow.transformer.transformer_layers
7
+ import mesh_tensorflow.transformer.utils
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 2048
13
+ d_kv = 64
14
+ d_model = 512
15
+ dropout_rate = 0.0
16
+ inputs_length = 512
17
+ mean_noise_span_length = 3.0
18
+ MIXTURE_NAME = 'c4_v220_unsupervised'
19
+ noise_density = 0.15
20
+ num_heads = 8
21
+
22
+ # Parameters for adafactor_decay_rate_pow:
23
+ # ==============================================================================
24
+ adafactor_decay_rate_pow.offset = 0
25
+
26
+ # Parameters for AdafactorOptimizer:
27
+ # ==============================================================================
28
+ AdafactorOptimizer.beta1 = 0.0
29
+ AdafactorOptimizer.clipping_threshold = 1.0
30
+ AdafactorOptimizer.decay_rate = None
31
+ AdafactorOptimizer.epsilon1 = 1e-30
32
+ AdafactorOptimizer.epsilon2 = 0.001
33
+ AdafactorOptimizer.factored = True
34
+ AdafactorOptimizer.min_dim_size_to_factor = 128
35
+ AdafactorOptimizer.multiply_by_parameter_scale = True
36
+
37
+ # Parameters for Bitransformer:
38
+ # ==============================================================================
39
+ Bitransformer.shared_embedding = True
40
+
41
+ # Parameters for denoise:
42
+ # ==============================================================================
43
+ denoise.inputs_fn = @preprocessors.noise_span_to_unique_sentinel
44
+ denoise.noise_density = %noise_density
45
+ denoise.noise_mask_fn = @preprocessors.random_spans_noise_mask
46
+ denoise.targets_fn = @preprocessors.nonnoise_span_to_unique_sentinel
47
+
48
+ # Parameters for decoder/DenseReluDense:
49
+ # ==============================================================================
50
+ decoder/DenseReluDense.activation = 'relu'
51
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
52
+ decoder/DenseReluDense.hidden_size = %d_ff
53
+ decoder/DenseReluDense.use_bias = False
54
+
55
+ # Parameters for encoder/DenseReluDense:
56
+ # ==============================================================================
57
+ encoder/DenseReluDense.activation = 'relu'
58
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
59
+ encoder/DenseReluDense.hidden_size = %d_ff
60
+ encoder/DenseReluDense.use_bias = False
61
+
62
+ # Parameters for enc_dec_attention:
63
+ # ==============================================================================
64
+ # None.
65
+
66
+ # Parameters for enc_dec_attention_bias:
67
+ # ==============================================================================
68
+ # None.
69
+
70
+ # Parameters for decoder/EncDecAttention:
71
+ # ==============================================================================
72
+ decoder/EncDecAttention.relative_attention_type = None
73
+
74
+ # Parameters for get_variable_dtype:
75
+ # ==============================================================================
76
+ get_variable_dtype.activation_dtype = 'bfloat16'
77
+
78
+ # Parameters for get_vocab_embedding_cls:
79
+ # ==============================================================================
80
+ # None.
81
+
82
+ # Parameters for get_vocabulary:
83
+ # ==============================================================================
84
+ get_vocabulary.mixture_or_task_name = %MIXTURE_NAME
85
+
86
+ # Parameters for decoder/LayerStack:
87
+ # ==============================================================================
88
+ decoder/LayerStack.dropout_rate = None
89
+ decoder/LayerStack.norm_epsilon = None
90
+ decoder/LayerStack.recompute_grads = False
91
+ decoder/LayerStack.sublayers_final = \
92
+ [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
93
+ decoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
94
+ decoder/LayerStack.sublayers_per_layer = \
95
+ [@transformer.sublayer_rms_norm,
96
+ @transformer.sublayer_call_layer,
97
+ @transformer.sublayer_dropout,
98
+ @transformer.sublayer_residual]
99
+
100
+ # Parameters for encoder/LayerStack:
101
+ # ==============================================================================
102
+ encoder/LayerStack.dropout_rate = None
103
+ encoder/LayerStack.norm_epsilon = None
104
+ encoder/LayerStack.recompute_grads = False
105
+ encoder/LayerStack.sublayers_final = \
106
+ [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
107
+ encoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
108
+ encoder/LayerStack.sublayers_per_layer = \
109
+ [@transformer.sublayer_rms_norm,
110
+ @transformer.sublayer_call_layer,
111
+ @transformer.sublayer_dropout,
112
+ @transformer.sublayer_residual]
113
+
114
+ # Parameters for learning_rate_schedule_noam:
115
+ # ==============================================================================
116
+ learning_rate_schedule_noam.linear_decay_fraction = 0.0
117
+ learning_rate_schedule_noam.multiplier = 1.0
118
+ learning_rate_schedule_noam.offset = 0
119
+ learning_rate_schedule_noam.warmup_steps = 10000
120
+
121
+ # Parameters for make_bitransformer:
122
+ # ==============================================================================
123
+ make_bitransformer.decoder_name = 'decoder'
124
+ make_bitransformer.encoder_name = 'encoder'
125
+
126
+ # Parameters for decoder/make_layer_stack:
127
+ # ==============================================================================
128
+ decoder/make_layer_stack.block_scope = True
129
+ decoder/make_layer_stack.layers = \
130
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
131
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
132
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
133
+ decoder/make_layer_stack.num_layers = 2
134
+
135
+ # Parameters for encoder/make_layer_stack:
136
+ # ==============================================================================
137
+ encoder/make_layer_stack.block_scope = True
138
+ encoder/make_layer_stack.layers = \
139
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
140
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
141
+ encoder/make_layer_stack.num_layers = 8
142
+
143
+ # Parameters for mesh_train_dataset_fn:
144
+ # ==============================================================================
145
+ mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
146
+ mesh_train_dataset_fn.pack = True
147
+ mesh_train_dataset_fn.seed = None
148
+ mesh_train_dataset_fn.shuffle = True
149
+ mesh_train_dataset_fn.use_cached = 1
150
+
151
+ # Parameters for noise_span_to_unique_sentinel:
152
+ # ==============================================================================
153
+ # None.
154
+
155
+ # Parameters for nonnoise_span_to_unique_sentinel:
156
+ # ==============================================================================
157
+ # None.
158
+
159
+ # Parameters for pack_dataset:
160
+ # ==============================================================================
161
+ pack_dataset.use_custom_ops = True
162
+
163
+ # Parameters for pack_or_pad:
164
+ # ==============================================================================
165
+ # None.
166
+
167
+ # Parameters for random_spans_helper:
168
+ # ==============================================================================
169
+ random_spans_helper.extra_tokens_per_span_inputs = 1
170
+ random_spans_helper.extra_tokens_per_span_targets = 1
171
+ random_spans_helper.inputs_length = %inputs_length
172
+ random_spans_helper.mean_noise_span_length = %mean_noise_span_length
173
+ random_spans_helper.noise_density = %noise_density
174
+ random_spans_helper.verbose = False
175
+
176
+ # Parameters for random_spans_noise_mask:
177
+ # ==============================================================================
178
+ random_spans_noise_mask.mean_noise_span_length = %mean_noise_span_length
179
+
180
+ # Parameters for random_spans_tokens_length:
181
+ # ==============================================================================
182
+ # None.
183
+
184
+ # Parameters for reduce_concat_tokens:
185
+ # ==============================================================================
186
+ reduce_concat_tokens.batch_size = 128
187
+ reduce_concat_tokens.feature_key = 'targets'
188
+
189
+ # Parameters for rewrite_stack_variables:
190
+ # ==============================================================================
191
+ rewrite_stack_variables.max_combined_variable_size = 536870912
192
+
193
+ # Parameters for run:
194
+ # ==============================================================================
195
+ run.autostack = True
196
+ run.batch_size = ('tokens_per_batch', 65536)
197
+ run.checkpoint_input_pipeline = False
198
+ run.dataset_split = 'train'
199
+ run.ensemble_inputs = None
200
+ run.eval_checkpoint_step = None
201
+ run.eval_dataset_fn = None
202
+ run.eval_summary_dir = None
203
+ run.export_checkpoint_step = None
204
+ run.export_path = ''
205
+ run.init_checkpoint = None
206
+ run.iterations_per_loop = 100
207
+ run.keep_checkpoint_max = None
208
+ run.layout_rules = \
209
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
210
+ run.learning_rate_schedule = @learning_rate_schedules.learning_rate_schedule_noam
211
+ run.mesh_devices = None
212
+ run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
213
+ run.mode = 'train'
214
+ run.model_type = 'bitransformer'
215
+ run.optimizer = @optimize.AdafactorOptimizer
216
+ run.output_eval_examples = True
217
+ run.perplexity_eval_steps = 100
218
+ run.predict_fn = None
219
+ run.save_checkpoints_steps = 5000
220
+ run.seen_data_init_step = 0
221
+ run.sequence_length = {'inputs': 512, 'targets': 128}
222
+ run.skip_seen_data = False
223
+ run.total_run_steps = None
224
+ run.train_dataset_fn = @t5.models.mesh_transformer.mesh_train_dataset_fn
225
+ run.train_steps = 524288
226
+ run.variable_filter = None
227
+
228
+ # Parameters for select_random_chunk:
229
+ # ==============================================================================
230
+ select_random_chunk.additional_feature_keys = None
231
+ select_random_chunk.additional_passthrough_keys = None
232
+ select_random_chunk.feature_key = 'targets'
233
+ select_random_chunk.max_length = 65536
234
+ select_random_chunk.uniform_random_start = False
235
+
236
+ # Parameters for decoder/SelfAttention:
237
+ # ==============================================================================
238
+ decoder/SelfAttention.attention_func = None
239
+ decoder/SelfAttention.attention_kwargs = None
240
+ decoder/SelfAttention.combine_dims = True
241
+ decoder/SelfAttention.dropout_rate = %dropout_rate
242
+ decoder/SelfAttention.fold_scaling_into_initializer = True
243
+ decoder/SelfAttention.keep_query_heads_dims = False
244
+ decoder/SelfAttention.key_value_size = %d_kv
245
+ decoder/SelfAttention.num_heads = %num_heads
246
+ decoder/SelfAttention.num_memory_heads = 0
247
+ decoder/SelfAttention.relative_attention_num_buckets = 32
248
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
249
+ decoder/SelfAttention.shared_kv = False
250
+
251
+ # Parameters for encoder/SelfAttention:
252
+ # ==============================================================================
253
+ encoder/SelfAttention.attention_func = None
254
+ encoder/SelfAttention.attention_kwargs = None
255
+ encoder/SelfAttention.combine_dims = True
256
+ encoder/SelfAttention.dropout_rate = %dropout_rate
257
+ encoder/SelfAttention.fold_scaling_into_initializer = True
258
+ encoder/SelfAttention.keep_query_heads_dims = False
259
+ encoder/SelfAttention.key_value_size = %d_kv
260
+ encoder/SelfAttention.num_heads = %num_heads
261
+ encoder/SelfAttention.num_memory_heads = 0
262
+ encoder/SelfAttention.relative_attention_num_buckets = 32
263
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
264
+ encoder/SelfAttention.shared_kv = False
265
+
266
+ # Parameters for serialize_num_microbatches:
267
+ # ==============================================================================
268
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
269
+
270
+ # Parameters for SimdMeshImpl:
271
+ # ==============================================================================
272
+ SimdMeshImpl.allreduce_in_bfloat16_max_group_size = 8
273
+
274
+ # Parameters for split_tokens:
275
+ # ==============================================================================
276
+ split_tokens.additional_feature_keys = None
277
+ split_tokens.feature_key = 'targets'
278
+ split_tokens.max_tokens_per_segment = @preprocessors.random_spans_tokens_length()
279
+ split_tokens.min_tokens_per_segment = None
280
+ split_tokens.passthrough_feature_keys = None
281
+
282
+ # Parameters for sublayer_call_layer:
283
+ # ==============================================================================
284
+ # None.
285
+
286
+ # Parameters for sublayer_dropout:
287
+ # ==============================================================================
288
+ sublayer_dropout.dropout_rate = %dropout_rate
289
+
290
+ # Parameters for sublayer_mask_padding:
291
+ # ==============================================================================
292
+ # None.
293
+
294
+ # Parameters for sublayer_residual:
295
+ # ==============================================================================
296
+ # None.
297
+
298
+ # Parameters for sublayer_rms_norm:
299
+ # ==============================================================================
300
+ sublayer_rms_norm.epsilon = 1e-06
301
+ sublayer_rms_norm.name = 'rms_norm'
302
+
303
+ # Parameters for tpu_estimator_model_fn:
304
+ # ==============================================================================
305
+ tpu_estimator_model_fn.hierarchical_tiling_spec = None
306
+ tpu_estimator_model_fn.init_variable_filter = ''
307
+ tpu_estimator_model_fn.model_info_file = ''
308
+ tpu_estimator_model_fn.outer_batch_size = 1
309
+ tpu_estimator_model_fn.tpu_summaries = False
310
+
311
+ # Parameters for tpu_mesh_shape:
312
+ # ==============================================================================
313
+ tpu_mesh_shape.ensemble_parallelism = None
314
+ tpu_mesh_shape.model_parallelism = 1
315
+ tpu_mesh_shape.tpu_topology = '4x4'
316
+
317
+ # Parameters for unit_scaling_convention:
318
+ # ==============================================================================
319
+ unit_scaling_convention.value = False
320
+
321
+ # Parameters for decoder/Unitransformer:
322
+ # ==============================================================================
323
+ decoder/Unitransformer.d_model = %d_model
324
+ decoder/Unitransformer.ensemble = None
325
+ decoder/Unitransformer.input_full_attention = False
326
+ decoder/Unitransformer.label_smoothing = 0.0
327
+ decoder/Unitransformer.loss_denominator = None
328
+ decoder/Unitransformer.loss_fn = None
329
+ decoder/Unitransformer.loss_on_targets_only = False
330
+ decoder/Unitransformer.max_length = 512
331
+ decoder/Unitransformer.positional_embedding = False
332
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
333
+ decoder/Unitransformer.sinusoid_positional_embedding = False
334
+ decoder/Unitransformer.token_dropout_rate = 0.0
335
+ decoder/Unitransformer.vocab_divisor = 128
336
+ decoder/Unitransformer.z_loss = 0.0001
337
+
338
+ # Parameters for encoder/Unitransformer:
339
+ # ==============================================================================
340
+ encoder/Unitransformer.d_model = %d_model
341
+ encoder/Unitransformer.ensemble = None
342
+ encoder/Unitransformer.input_full_attention = False
343
+ encoder/Unitransformer.label_smoothing = 0.0
344
+ encoder/Unitransformer.loss_denominator = None
345
+ encoder/Unitransformer.loss_fn = None
346
+ encoder/Unitransformer.loss_on_targets_only = False
347
+ encoder/Unitransformer.max_length = 512
348
+ encoder/Unitransformer.positional_embedding = False
349
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
350
+ encoder/Unitransformer.sinusoid_positional_embedding = False
351
+ encoder/Unitransformer.token_dropout_rate = 0.0
352
+ encoder/Unitransformer.vocab_divisor = 128
353
+ encoder/Unitransformer.z_loss = 0.0001
354
+
355
+ # Parameters for unsupervised:
356
+ # ==============================================================================
357
+ unsupervised.preprocessors = \
358
+ [@preprocessors.select_random_chunk,
359
+ @preprocessors.reduce_concat_tokens,
360
+ @preprocessors.split_tokens,
361
+ @preprocessors.denoise]
362
+
363
+ # Parameters for VarianceScalingInitializer:
364
+ # ==============================================================================
365
+ VarianceScalingInitializer.distribution = 'normal'
366
+ VarianceScalingInitializer.mode = 'fan_in'
367
+ VarianceScalingInitializer.scale = 1.0
368
+
369
+ # Parameters for VocabEmbedding:
370
+ # ==============================================================================
371
+ VocabEmbedding.scale_variable_like_classifier_weights = False