pere committed on
Commit
7eb7be0
1 Parent(s): b025910

first complete

Browse files
Files changed (5) hide show
  1. README.md +10 -0
  2. config.json +3 -1
  3. operative_config.gin +296 -0
  4. pytorch_model.bin +3 -0
  5. tokenizer.json +0 -0
README.md CHANGED
@@ -28,5 +28,15 @@ After creating the T5X model, the model is converted to Huggingface Flax by a mo
28
  python3 convert_t5_checkpoint_to_flax.py --t5x_checkpoint_path /tmp/t5x_checkpoints/t5_small/checkpoint_1000000/ --flax_dump_folder_path /tmp/flax_dump_folder/ --config_name t5-small
29
  ```
30
 
 
 
 
 
 
 
 
 
 
 
31
 
32
 
 
28
  python3 convert_t5_checkpoint_to_flax.py --t5x_checkpoint_path /tmp/t5x_checkpoints/t5_small/checkpoint_1000000/ --flax_dump_folder_path /tmp/flax_dump_folder/ --config_name t5-small
29
  ```
30
 
31
+ The tokenizer.json was copied from https://huggingface.co/t5-small/blob/main/tokenizer.json.
32
+
33
+ To be able to use the widgets on Hugging Face, the model was converted to PyTorch by running:
34
+ ```python
35
+ from transformers import T5ForConditionalGeneration
36
+ model = T5ForConditionalGeneration.from_pretrained(".", from_flax=True)
37
+ model.save_pretrained(".")
38
+ ```
39
+
40
+
41
 
42
 
config.json CHANGED
@@ -1,6 +1,7 @@
1
  {
 
2
  "architectures": [
3
- "T5Model"
4
  ],
5
  "d_ff": 2048,
6
  "d_kv": 64,
@@ -49,6 +50,7 @@
49
  "prefix": "translate English to Romanian: "
50
  }
51
  },
 
52
  "transformers_version": "4.16.2",
53
  "use_cache": true,
54
  "vocab_size": 32128
 
1
  {
2
+ "_name_or_path": ".",
3
  "architectures": [
4
+ "T5ForConditionalGeneration"
5
  ],
6
  "d_ff": 2048,
7
  "d_kv": 64,
 
50
  "prefix": "translate English to Romanian: "
51
  }
52
  },
53
+ "torch_dtype": "float32",
54
  "transformers_version": "4.16.2",
55
  "use_cache": true,
56
  "vocab_size": 32128
operative_config.gin ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import t5.models.mesh_transformer
2
+ import t5.data.sentencepiece_vocabulary
3
+ import mesh_tensorflow.optimize
4
+ import mesh_tensorflow.transformer.dataset
5
+ import mesh_tensorflow.transformer.learning_rate_schedules
6
+ import mesh_tensorflow.transformer.t2t_vocabulary
7
+ import mesh_tensorflow.transformer.transformer_layers
8
+ import mesh_tensorflow.transformer.utils
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 2048
13
+ d_kv = 64
14
+ d_model = 512
15
+ dropout_rate = 0.1
16
+ inputs_length = 512
17
+ mean_noise_span_length = 3.0
18
+ MIXTURE_NAME = 'all_mix'
19
+ noise_density = 0.15
20
+ num_heads = 8
21
+ num_layers = 6
22
+ targets_length = 512
23
+ init_checkpoint = "gs://t5-data/pretrained_models/small/model.ckpt-1000000"
24
+ tokens_per_batch = 1048576
25
+
26
+ # Parameters for AdafactorOptimizer:
27
+ # ==============================================================================
28
+ AdafactorOptimizer.beta1 = 0.0
29
+ AdafactorOptimizer.clipping_threshold = 1.0
30
+ AdafactorOptimizer.decay_rate = None
31
+ AdafactorOptimizer.epsilon1 = 1e-30
32
+ AdafactorOptimizer.epsilon2 = 0.001
33
+ AdafactorOptimizer.factored = True
34
+ AdafactorOptimizer.min_dim_size_to_factor = 128
35
+ AdafactorOptimizer.multiply_by_parameter_scale = True
36
+
37
+ # Parameters for Bitransformer:
38
+ # ==============================================================================
39
+ Bitransformer.shared_embedding = True
40
+
41
+ # Parameters for denoise:
42
+ # ==============================================================================
43
+ denoise.inputs_fn = @preprocessors.noise_span_to_unique_sentinel
44
+ denoise.noise_density = %noise_density
45
+ denoise.noise_mask_fn = @preprocessors.random_spans_noise_mask
46
+ denoise.targets_fn = @preprocessors.nonnoise_span_to_unique_sentinel
47
+
48
+ # Parameters for decoder/DenseReluDense:
49
+ # ==============================================================================
50
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
51
+ decoder/DenseReluDense.hidden_size = %d_ff
52
+
53
+ # Parameters for encoder/DenseReluDense:
54
+ # ==============================================================================
55
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
56
+ encoder/DenseReluDense.hidden_size = %d_ff
57
+
58
+ # Parameters for decoder/EncDecAttention:
59
+ # ==============================================================================
60
+ # None.
61
+
62
+ # Parameters for get_sentencepiece_model_path:
63
+ # ==============================================================================
64
+ get_sentencepiece_model_path.mixture_or_task_name = %MIXTURE_NAME
65
+
66
+ # Parameters for get_variable_dtype:
67
+ # ==============================================================================
68
+ get_variable_dtype.activation_dtype = 'bfloat16'
69
+
70
+ # Parameters for decoder/LayerStack:
71
+ # ==============================================================================
72
+ decoder/LayerStack.dropout_rate = %dropout_rate
73
+ decoder/LayerStack.norm_epsilon = 1e-06
74
+
75
+ # Parameters for encoder/LayerStack:
76
+ # ==============================================================================
77
+ encoder/LayerStack.dropout_rate = %dropout_rate
78
+ encoder/LayerStack.norm_epsilon = 1e-06
79
+
80
+ # Parameters for learning_rate_schedule_noam:
81
+ # ==============================================================================
82
+ learning_rate_schedule_noam.linear_decay_fraction = 0.1
83
+ learning_rate_schedule_noam.multiplier = 1.0
84
+ learning_rate_schedule_noam.offset = 0
85
+ learning_rate_schedule_noam.warmup_steps = 10000
86
+
87
+ # Parameters for make_bitransformer:
88
+ # ==============================================================================
89
+ make_bitransformer.decoder_name = 'decoder'
90
+ make_bitransformer.encoder_name = 'encoder'
91
+
92
+ # Parameters for decoder/make_layer_stack:
93
+ # ==============================================================================
94
+ decoder/make_layer_stack.block_scope = True
95
+ decoder/make_layer_stack.layers = \
96
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
97
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
98
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
99
+ decoder/make_layer_stack.num_layers = %num_layers
100
+
101
+ # Parameters for encoder/make_layer_stack:
102
+ # ==============================================================================
103
+ encoder/make_layer_stack.block_scope = True
104
+ encoder/make_layer_stack.layers = \
105
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
106
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
107
+ encoder/make_layer_stack.num_layers = %num_layers
108
+
109
+ # Parameters for mesh_train_dataset_fn:
110
+ # ==============================================================================
111
+ mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
112
+
113
+
114
+ # Parameters for noise_span_to_unique_sentinel:
115
+ # ==============================================================================
116
+ # None.
117
+
118
+ # Parameters for nonnoise_span_to_unique_sentinel:
119
+ # ==============================================================================
120
+ # None.
121
+
122
+ # Parameters for pack_dataset:
123
+ # ==============================================================================
124
+
125
+
126
+ # Parameters for pack_or_pad:
127
+ # ==============================================================================
128
+ # None.
129
+
130
+ # Parameters for random_spans_helper:
131
+ # ==============================================================================
132
+ random_spans_helper.extra_tokens_per_span_inputs = 1
133
+ random_spans_helper.extra_tokens_per_span_targets = 1
134
+ random_spans_helper.inputs_length = %inputs_length
135
+ random_spans_helper.mean_noise_span_length = %mean_noise_span_length
136
+ random_spans_helper.noise_density = %noise_density
137
+
138
+ # Parameters for targets_length/random_spans_helper:
139
+ # ==============================================================================
140
+ targets_length/random_spans_helper.extra_tokens_per_span_inputs = 1
141
+ targets_length/random_spans_helper.extra_tokens_per_span_targets = 1
142
+ targets_length/random_spans_helper.inputs_length = %inputs_length
143
+ targets_length/random_spans_helper.mean_noise_span_length = %mean_noise_span_length
144
+ targets_length/random_spans_helper.noise_density = %noise_density
145
+
146
+ # Parameters for random_spans_noise_mask:
147
+ # ==============================================================================
148
+ random_spans_noise_mask.mean_noise_span_length = %mean_noise_span_length
149
+
150
+ # Parameters for targets_length/random_spans_targets_length:
151
+ # ==============================================================================
152
+ # None.
153
+
154
+ # Parameters for random_spans_tokens_length:
155
+ # ==============================================================================
156
+ # None.
157
+
158
+ # Parameters for rate_num_examples:
159
+ # ==============================================================================
160
+ rate_num_examples.maximum = 1000000.0
161
+ rate_num_examples.scale = 1.0
162
+ rate_num_examples.temperature = 1.0
163
+
164
+ # Parameters for rate_unsupervised:
165
+ # ==============================================================================
166
+ rate_unsupervised.value = 710000.0
167
+
168
+ # Parameters for reduce_concat_tokens:
169
+ # ==============================================================================
170
+ reduce_concat_tokens.batch_size = 128
171
+ reduce_concat_tokens.feature_key = 'targets'
172
+
173
+ # Parameters for run:
174
+ # ==============================================================================
175
+ run.autostack = True
176
+ run.batch_size = ('tokens_per_batch', %tokens_per_batch)
177
+ run.dataset_split = 'train'
178
+ run.ensemble_inputs = None
179
+ run.eval_checkpoint_step = None
180
+ run.eval_dataset_fn = None
181
+ run.eval_summary_dir = None
182
+ run.export_path = ''
183
+ run.iterations_per_loop = 100
184
+ run.keep_checkpoint_max = None
185
+ run.layout_rules = \
186
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
187
+ run.learning_rate_schedule = @learning_rate_schedules.learning_rate_schedule_noam
188
+ run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
189
+ run.mode = 'train'
190
+ run.init_checkpoint = %init_checkpoint
191
+ run.model_type = 'bitransformer'
192
+ run.optimizer = @optimize.AdafactorOptimizer
193
+ run.perplexity_eval_steps = 10
194
+ run.predict_fn = None
195
+ run.save_checkpoints_steps = 2400
196
+ run.sequence_length = {'inputs': %inputs_length, 'targets': %targets_length}
197
+ run.train_dataset_fn = \
198
+ @t5.models.mesh_transformer.mesh_train_dataset_fn
199
+ run.train_steps = 1000000000
200
+ run.variable_filter = None
201
+ run.vocabulary = \
202
+ @t5.data.sentencepiece_vocabulary.SentencePieceVocabulary()
203
+
204
+ # Parameters for select_random_chunk:
205
+ # ==============================================================================
206
+ select_random_chunk.feature_key = 'targets'
207
+ select_random_chunk.max_length = 65536
208
+
209
+ # Parameters for decoder/SelfAttention:
210
+ # ==============================================================================
211
+ decoder/SelfAttention.attention_kwargs = None
212
+ decoder/SelfAttention.dropout_rate = %dropout_rate
213
+ decoder/SelfAttention.key_value_size = %d_kv
214
+ decoder/SelfAttention.num_heads = %num_heads
215
+ decoder/SelfAttention.num_memory_heads = 0
216
+ decoder/SelfAttention.relative_attention_num_buckets = 32
217
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
218
+ decoder/SelfAttention.shared_kv = False
219
+
220
+ # Parameters for encoder/SelfAttention:
221
+ # ==============================================================================
222
+ encoder/SelfAttention.attention_kwargs = None
223
+ encoder/SelfAttention.dropout_rate = %dropout_rate
224
+ encoder/SelfAttention.key_value_size = %d_kv
225
+ encoder/SelfAttention.num_heads = %num_heads
226
+ encoder/SelfAttention.num_memory_heads = 0
227
+ encoder/SelfAttention.relative_attention_num_buckets = 32
228
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
229
+ encoder/SelfAttention.shared_kv = False
230
+
231
+ # Parameters for SentencePieceVocabulary:
232
+ # ==============================================================================
233
+ SentencePieceVocabulary.extra_ids = 100
234
+ SentencePieceVocabulary.sentencepiece_model_file = \
235
+ @t5.models.mesh_transformer.get_sentencepiece_model_path()
236
+
237
+ # Parameters for serialize_num_microbatches:
238
+ # ==============================================================================
239
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
240
+
241
+ # Parameters for split_tokens:
242
+ # ==============================================================================
243
+ split_tokens.feature_key = 'targets'
244
+ split_tokens.max_tokens_per_segment = @preprocessors.random_spans_tokens_length()
245
+ split_tokens.min_tokens_per_segment = None
246
+
247
+ # Parameters for tpu_estimator_model_fn:
248
+ # ==============================================================================
249
+ tpu_estimator_model_fn.init_checkpoint = %init_checkpoint
250
+ tpu_estimator_model_fn.outer_batch_size = 1
251
+ tpu_estimator_model_fn.tpu_summaries = False
252
+
253
+ # Parameters for tpu_mesh_shape:
254
+ # ==============================================================================
255
+ tpu_mesh_shape.ensemble_parallelism = None
256
+ tpu_mesh_shape.model_parallelism = 1
257
+ tpu_mesh_shape.tpu_topology = '8x8'
258
+
259
+ # Parameters for decoder/Unitransformer:
260
+ # ==============================================================================
261
+ decoder/Unitransformer.d_model = %d_model
262
+ decoder/Unitransformer.ensemble = None
263
+ decoder/Unitransformer.input_full_attention = False
264
+ decoder/Unitransformer.label_smoothing = 0.0
265
+ decoder/Unitransformer.loss_denominator = None
266
+ decoder/Unitransformer.loss_fn = None
267
+ decoder/Unitransformer.loss_on_targets_only = False
268
+ decoder/Unitransformer.max_length = 512
269
+ decoder/Unitransformer.positional_embedding = False
270
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
271
+ decoder/Unitransformer.vocab_divisor = 128
272
+ decoder/Unitransformer.z_loss = 0.0001
273
+ decoder/Unitransformer.loss_denominator = 233472
274
+
275
+ # Parameters for encoder/Unitransformer:
276
+ # ==============================================================================
277
+ encoder/Unitransformer.d_model = %d_model
278
+ encoder/Unitransformer.ensemble = None
279
+ encoder/Unitransformer.input_full_attention = False
280
+ encoder/Unitransformer.label_smoothing = 0.0
281
+ encoder/Unitransformer.loss_denominator = None
282
+ encoder/Unitransformer.loss_fn = None
283
+ encoder/Unitransformer.loss_on_targets_only = False
284
+ encoder/Unitransformer.max_length = 512
285
+ encoder/Unitransformer.positional_embedding = False
286
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
287
+ encoder/Unitransformer.vocab_divisor = 128
288
+ encoder/Unitransformer.z_loss = 0.0001
289
+
290
+ # Parameters for unsupervised:
291
+ # ==============================================================================
292
+ unsupervised.preprocessors = \
293
+ [@preprocessors.select_random_chunk,
294
+ @preprocessors.reduce_concat_tokens,
295
+ @preprocessors.split_tokens,
296
+ @preprocessors.denoise]
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6813c045abe80ab44d4f52988d5f1ecf7f6ce6f07bd1f4eb117c3299ef0b217
3
+ size 242085986
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff