Upload folder using huggingface_hub

#5
.gitattributes CHANGED
@@ -7,3 +7,5 @@
7
  *.ot filter=lfs diff=lfs merge=lfs -text
8
  *.onnx filter=lfs diff=lfs merge=lfs -text
9
  model.safetensors filter=lfs diff=lfs merge=lfs -text
 
 
 
7
  *.ot filter=lfs diff=lfs merge=lfs -text
8
  *.onnx filter=lfs diff=lfs merge=lfs -text
9
  model.safetensors filter=lfs diff=lfs merge=lfs -text
10
+ model.ckpt-1229942.data-00001-of-00002 filter=lfs diff=lfs merge=lfs -text
11
+ model.ckpt-1229942.meta filter=lfs diff=lfs merge=lfs -text
model.ckpt-1229942.data-00000-of-00002 ADDED
Binary file (8 Bytes). View file
 
model.ckpt-1229942.data-00001-of-00002 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30a04543eb97388e5aac909a6c8c61da095865406e559e1b6a9549d149f765e7
3
+ size 447754240
model.ckpt-1229942.index ADDED
Binary file (10.9 kB). View file
 
model.ckpt-1229942.meta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b5a5aa8921b5eb418d2f1fcd0eccd618b75a99d86da56da7b605a8b97ecc304
3
+ size 20836297
operative_config.gin ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer_layers
6
+ import mesh_tensorflow.transformer.utils
7
+ import t5.data.sentencepiece_vocabulary
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 3072
13
+ d_kv = 64
14
+ d_model = 768
15
+ dropout_rate = 0.1
16
+ init_checkpoint = 'gs://t5-data/pretrained_models/base/model.ckpt-999900'
17
+ MIXTURE_NAME = 'all_mix'
18
+ noise_density = 0.15
19
+ num_heads = 12
20
+ num_layers = 12
21
+
22
+ # Parameters for AdafactorOptimizer:
23
+ # ==============================================================================
24
+ AdafactorOptimizer.beta1 = 0.0
25
+ AdafactorOptimizer.clipping_threshold = 1.0
26
+ AdafactorOptimizer.decay_rate = None
27
+ AdafactorOptimizer.epsilon1 = 1e-30
28
+ AdafactorOptimizer.epsilon2 = 0.001
29
+ AdafactorOptimizer.factored = True
30
+ AdafactorOptimizer.min_dim_size_to_factor = 128
31
+ AdafactorOptimizer.multiply_by_parameter_scale = True
32
+
33
+ # Parameters for Bitransformer:
34
+ # ==============================================================================
35
+ Bitransformer.shared_embedding = True
36
+
37
+ # Parameters for denoise:
38
+ # ==============================================================================
39
+ # None.
40
+
41
+ # Parameters for decoder/DenseReluDense:
42
+ # ==============================================================================
43
+ decoder/DenseReluDense.activation = 'relu'
44
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
45
+ decoder/DenseReluDense.hidden_size = %d_ff
46
+
47
+ # Parameters for encoder/DenseReluDense:
48
+ # ==============================================================================
49
+ encoder/DenseReluDense.activation = 'relu'
50
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
51
+ encoder/DenseReluDense.hidden_size = %d_ff
52
+
53
+ # Parameters for decoder/EncDecAttention:
54
+ # ==============================================================================
55
+ # None.
56
+
57
+ # Parameters for get_variable_dtype:
58
+ # ==============================================================================
59
+ get_variable_dtype.activation_dtype = 'bfloat16'
60
+
61
+ # Parameters for get_vocab_embedding_cls:
62
+ # ==============================================================================
63
+ # None.
64
+
65
+ # Parameters for get_vocabulary:
66
+ # ==============================================================================
67
+ # None.
68
+
69
+ # Parameters for iid_noise_mask:
70
+ # ==============================================================================
71
+ # None.
72
+
73
+ # Parameters for decoder/LayerStack:
74
+ # ==============================================================================
75
+ decoder/LayerStack.dropout_rate = %dropout_rate
76
+ decoder/LayerStack.norm_epsilon = 1e-06
77
+ decoder/LayerStack.recompute_grads = False
78
+
79
+ # Parameters for encoder/LayerStack:
80
+ # ==============================================================================
81
+ encoder/LayerStack.dropout_rate = %dropout_rate
82
+ encoder/LayerStack.norm_epsilon = 1e-06
83
+ encoder/LayerStack.recompute_grads = False
84
+
85
+ # Parameters for make_bitransformer:
86
+ # ==============================================================================
87
+ make_bitransformer.decoder_name = 'decoder'
88
+ make_bitransformer.encoder_name = 'encoder'
89
+
90
+ # Parameters for decoder/make_layer_stack:
91
+ # ==============================================================================
92
+ decoder/make_layer_stack.block_scope = True
93
+ decoder/make_layer_stack.layers = \
94
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
95
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
96
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
97
+ decoder/make_layer_stack.num_layers = %num_layers
98
+
99
+ # Parameters for encoder/make_layer_stack:
100
+ # ==============================================================================
101
+ encoder/make_layer_stack.block_scope = True
102
+ encoder/make_layer_stack.layers = \
103
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
104
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
105
+ encoder/make_layer_stack.num_layers = %num_layers
106
+
107
+ # Parameters for maybe_print_dataset:
108
+ # ==============================================================================
109
+ maybe_print_dataset.should_print = False
110
+
111
+ # Parameters for mesh_train_dataset_fn:
112
+ # ==============================================================================
113
+ mesh_train_dataset_fn.use_cached = False
114
+
115
+ # Parameters for MtfModel:
116
+ # ==============================================================================
117
+ MtfModel.autostack = True
118
+ MtfModel.ensemble_inputs = None
119
+ MtfModel.gcp_project = None
120
+ MtfModel.layout_rules = \
121
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
122
+ MtfModel.mesh_devices = None
123
+ MtfModel.mesh_shape = None
124
+ MtfModel.model_type = 'bitransformer'
125
+ MtfModel.optimizer = None
126
+ MtfModel.predict_fn = None
127
+ MtfModel.tpu_job_name = None
128
+ MtfModel.tpu_zone = None
129
+ MtfModel.variable_filter = None
130
+
131
+ # Parameters for noise_token_to_sentinel:
132
+ # ==============================================================================
133
+ # None.
134
+
135
+ # Parameters for num_parallel_calls:
136
+ # ==============================================================================
137
+ num_parallel_calls.deterministic = False
138
+
139
+ # Parameters for pack_dataset:
140
+ # ==============================================================================
141
+ pack_dataset.use_custom_ops = False
142
+
143
+ # Parameters for pack_or_pad:
144
+ # ==============================================================================
145
+ # None.
146
+
147
+ # Parameters for decoder/SelfAttention:
148
+ # ==============================================================================
149
+ decoder/SelfAttention.attention_func = None
150
+ decoder/SelfAttention.attention_kwargs = None
151
+ decoder/SelfAttention.combine_dims = True
152
+ decoder/SelfAttention.dropout_rate = %dropout_rate
153
+ decoder/SelfAttention.keep_query_heads_dims = False
154
+ decoder/SelfAttention.key_value_size = %d_kv
155
+ decoder/SelfAttention.num_heads = %num_heads
156
+ decoder/SelfAttention.num_memory_heads = 0
157
+ decoder/SelfAttention.relative_attention_num_buckets = 32
158
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
159
+ decoder/SelfAttention.shared_kv = False
160
+
161
+ # Parameters for encoder/SelfAttention:
162
+ # ==============================================================================
163
+ encoder/SelfAttention.attention_func = None
164
+ encoder/SelfAttention.attention_kwargs = None
165
+ encoder/SelfAttention.combine_dims = True
166
+ encoder/SelfAttention.dropout_rate = %dropout_rate
167
+ encoder/SelfAttention.keep_query_heads_dims = False
168
+ encoder/SelfAttention.key_value_size = %d_kv
169
+ encoder/SelfAttention.num_heads = %num_heads
170
+ encoder/SelfAttention.num_memory_heads = 0
171
+ encoder/SelfAttention.relative_attention_num_buckets = 32
172
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
173
+ encoder/SelfAttention.shared_kv = False
174
+
175
+ # Parameters for SentencePieceVocabulary:
176
+ # ==============================================================================
177
+ # None.
178
+
179
+ # Parameters for sentinel_id:
180
+ # ==============================================================================
181
+ sentinel_id.return_value = None
182
+
183
+ # Parameters for serialize_num_microbatches:
184
+ # ==============================================================================
185
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
186
+
187
+ # Parameters for shift_targets:
188
+ # ==============================================================================
189
+ shift_targets.bos_id = 0
190
+ shift_targets.eos_id = 1
191
+
192
+ # Parameters for tpu_estimator_model_fn:
193
+ # ==============================================================================
194
+ tpu_estimator_model_fn.model_info_file = None
195
+ tpu_estimator_model_fn.outer_batch_size = 1
196
+ tpu_estimator_model_fn.tpu_summaries = False
197
+
198
+ # Parameters for tpu_mesh_shape:
199
+ # ==============================================================================
200
+ tpu_mesh_shape.ensemble_parallelism = None
201
+
202
+ # Parameters for decoder/Unitransformer:
203
+ # ==============================================================================
204
+ decoder/Unitransformer.d_model = %d_model
205
+ decoder/Unitransformer.ensemble = None
206
+ decoder/Unitransformer.input_full_attention = False
207
+ decoder/Unitransformer.label_smoothing = 0.0
208
+ decoder/Unitransformer.loss_denominator = 233472
209
+ decoder/Unitransformer.loss_fn = None
210
+ decoder/Unitransformer.loss_on_targets_only = False
211
+ decoder/Unitransformer.max_length = 512
212
+ decoder/Unitransformer.positional_embedding = False
213
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
214
+ decoder/Unitransformer.sinusoid_positional_embedding = False
215
+ decoder/Unitransformer.token_dropout_rate = 0.0
216
+ decoder/Unitransformer.vocab_divisor = 128
217
+ decoder/Unitransformer.z_loss = 0.0001
218
+
219
+ # Parameters for encoder/Unitransformer:
220
+ # ==============================================================================
221
+ encoder/Unitransformer.d_model = %d_model
222
+ encoder/Unitransformer.ensemble = None
223
+ encoder/Unitransformer.input_full_attention = False
224
+ encoder/Unitransformer.label_smoothing = 0.0
225
+ encoder/Unitransformer.loss_denominator = None
226
+ encoder/Unitransformer.loss_fn = None
227
+ encoder/Unitransformer.loss_on_targets_only = False
228
+ encoder/Unitransformer.max_length = 512
229
+ encoder/Unitransformer.positional_embedding = False
230
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
231
+ encoder/Unitransformer.sinusoid_positional_embedding = False
232
+ encoder/Unitransformer.token_dropout_rate = 0.0
233
+ encoder/Unitransformer.vocab_divisor = 128
234
+ encoder/Unitransformer.z_loss = 0.0001
235
+
236
+ # Parameters for VarianceScalingInitializer:
237
+ # ==============================================================================
238
+ VarianceScalingInitializer.distribution = 'normal'
239
+ VarianceScalingInitializer.mode = 'fan_in'
240
+ VarianceScalingInitializer.scale = 1.0
241
+
242
+ # Parameters for VocabEmbedding:
243
+ # ==============================================================================
244
+ # None.
245
+
246
+ # Parameters for Vocabulary:
247
+ # ==============================================================================
248
+ # None.