patrickvonplaten committed on
Commit
be0c125
1 Parent(s): 2598166

Add config

Browse files
Files changed (2) hide show
  1. config.json +4 -4
  2. operative_config.gin +6 -8
config.json CHANGED
@@ -3,9 +3,9 @@
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
6
- "d_ff": 16384,
7
- "d_kv": 32,
8
- "d_model": 256,
9
  "decoder_start_token_id": 0,
10
  "dropout_rate": 0.1,
11
  "eos_token_id": 1,
@@ -16,7 +16,7 @@
16
  "model_type": "t5",
17
  "n_positions": 512,
18
  "num_decoder_layers": 12,
19
- "num_heads": 4,
20
  "num_layers": 12,
21
  "pad_token_id": 0,
22
  "relative_attention_num_buckets": 32,
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
6
+ "d_ff": 3072,
7
+ "d_kv": 64,
8
+ "d_model": 768,
9
  "decoder_start_token_id": 0,
10
  "dropout_rate": 0.1,
11
  "eos_token_id": 1,
16
  "model_type": "t5",
17
  "n_positions": 512,
18
  "num_decoder_layers": 12,
19
+ "num_heads": 12,
20
  "num_layers": 12,
21
  "pad_token_id": 0,
22
  "relative_attention_num_buckets": 32,
operative_config.gin CHANGED
@@ -9,15 +9,15 @@ import t5.models.mesh_transformer
9
 
10
  # Macros:
11
  # ==============================================================================
12
- d_ff = 16384
13
- d_kv = 32
14
- d_model = 256
15
  dropout_rate = 0.0
16
  inputs_length = 512
17
  mean_noise_span_length = 3.0
18
  MIXTURE_NAME = 'c4_v220_unsupervised'
19
  noise_density = 0.15
20
- num_heads = 4
21
  num_layers = 12
22
 
23
  # Parameters for adafactor_decay_rate_pow:
@@ -146,7 +146,6 @@ encoder/make_layer_stack.num_layers = %num_layers
146
  mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
147
  mesh_train_dataset_fn.pack = True
148
  mesh_train_dataset_fn.seed = None
149
- mesh_train_dataset_fn.shuffle = True
150
  mesh_train_dataset_fn.use_cached = 1
151
 
152
  # Parameters for noise_span_to_unique_sentinel:
@@ -195,7 +194,6 @@ rewrite_stack_variables.max_combined_variable_size = 536870912
195
  # ==============================================================================
196
  run.autostack = True
197
  run.batch_size = ('tokens_per_batch', 65536)
198
- run.checkpoint_input_pipeline = False
199
  run.dataset_split = 'train'
200
  run.ensemble_inputs = None
201
  run.eval_checkpoint_step = None
@@ -217,7 +215,7 @@ run.optimizer = @optimize.AdafactorOptimizer
217
  run.output_eval_examples = True
218
  run.perplexity_eval_steps = 100
219
  run.predict_fn = None
220
- run.save_checkpoints_steps = 5000
221
  run.seen_data_init_step = 0
222
  run.sequence_length = {'inputs': 512, 'targets': 128}
223
  run.skip_seen_data = False
@@ -312,7 +310,7 @@ tpu_estimator_model_fn.tpu_summaries = False
312
  # Parameters for tpu_mesh_shape:
313
  # ==============================================================================
314
  tpu_mesh_shape.ensemble_parallelism = None
315
- tpu_mesh_shape.model_parallelism = 2
316
  tpu_mesh_shape.tpu_topology = '4x4'
317
 
318
  # Parameters for unit_scaling_convention:
9
 
10
  # Macros:
11
  # ==============================================================================
12
+ d_ff = 3072
13
+ d_kv = 64
14
+ d_model = 768
15
  dropout_rate = 0.0
16
  inputs_length = 512
17
  mean_noise_span_length = 3.0
18
  MIXTURE_NAME = 'c4_v220_unsupervised'
19
  noise_density = 0.15
20
+ num_heads = 12
21
  num_layers = 12
22
 
23
  # Parameters for adafactor_decay_rate_pow:
146
  mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
147
  mesh_train_dataset_fn.pack = True
148
  mesh_train_dataset_fn.seed = None
 
149
  mesh_train_dataset_fn.use_cached = 1
150
 
151
  # Parameters for noise_span_to_unique_sentinel:
194
  # ==============================================================================
195
  run.autostack = True
196
  run.batch_size = ('tokens_per_batch', 65536)
 
197
  run.dataset_split = 'train'
198
  run.ensemble_inputs = None
199
  run.eval_checkpoint_step = None
215
  run.output_eval_examples = True
216
  run.perplexity_eval_steps = 100
217
  run.predict_fn = None
218
+ run.save_checkpoints_steps = 10000
219
  run.seen_data_init_step = 0
220
  run.sequence_length = {'inputs': 512, 'targets': 128}
221
  run.skip_seen_data = False
310
  # Parameters for tpu_mesh_shape:
311
  # ==============================================================================
312
  tpu_mesh_shape.ensemble_parallelism = None
313
+ tpu_mesh_shape.model_parallelism = 1
314
  tpu_mesh_shape.tpu_topology = '4x4'
315
 
316
  # Parameters for unit_scaling_convention: