yhavinga committed
Commit
ab79d30
1 Parent(s): 8b38a7a
config.gin ADDED
@@ -0,0 +1,149 @@
+ from __gin__ import dynamic_registration
+ import __main__ as train_script
+ import seqio
+ import t5.data.mixtures
+ from t5x import adafactor
+ from t5x.examples.t5 import network
+ from t5x import gin_utils
+ from t5x import models
+ from t5x import partitioning
+ from t5x import trainer
+ from t5x import utils
+ import tasks.nedd_tasks
+
+ # Macros:
+ # ==============================================================================
+ BATCH_SIZE = 128
+ DROPOUT_RATE = 0.0
+ LABEL_SMOOTHING = 0.0
+ LOSS_NORMALIZING_FACTOR = None
+ MIXTURE_OR_TASK_MODULE = None
+ MIXTURE_OR_TASK_NAME = 'mc4_nedd_wiki_news_mix_1'
+ MODEL = @models.EncoderDecoderModel()
+ MODEL_DIR = 't5_1_1_base_nl36_mc4_nedd_wiki_news_nl'
+ OPTIMIZER = @adafactor.Adafactor()
+ RANDOM_SEED = None
+ SHUFFLE_TRAIN_EXAMPLES = True
+ TASK_FEATURE_LENGTHS = {'inputs': 512, 'targets': 114}
+ TRAIN_STEPS = 1000000
+ USE_CACHED_TASKS = False
+ USE_HARDWARE_RNG = False
+ VOCABULARY = @seqio.SentencePieceVocabulary()
+ Z_LOSS = 0.0001
+
+ # Parameters for adafactor.Adafactor:
+ # ==============================================================================
+ adafactor.Adafactor.decay_rate = 0.8
+ adafactor.Adafactor.logical_factor_rules = \
+ @adafactor.standard_logical_factor_rules()
+ adafactor.Adafactor.step_offset = 0
+
+ # Parameters for utils.CheckpointConfig:
+ # ==============================================================================
+ utils.CheckpointConfig.restore = @utils.RestoreCheckpointConfig()
+ utils.CheckpointConfig.save = @utils.SaveCheckpointConfig()
+
+ # Parameters for utils.create_learning_rate_scheduler:
+ # ==============================================================================
+ utils.create_learning_rate_scheduler.base_learning_rate = 1.0
+ utils.create_learning_rate_scheduler.factors = 'constant * rsqrt_decay'
+ utils.create_learning_rate_scheduler.warmup_steps = 10000
+
+ # Parameters for train/utils.DatasetConfig:
+ # ==============================================================================
+ train/utils.DatasetConfig.batch_size = %BATCH_SIZE
+ train/utils.DatasetConfig.mixture_or_task_name = %MIXTURE_OR_TASK_NAME
+ train/utils.DatasetConfig.module = %MIXTURE_OR_TASK_MODULE
+ train/utils.DatasetConfig.pack = True
+ train/utils.DatasetConfig.seed = None
+ train/utils.DatasetConfig.shuffle = %SHUFFLE_TRAIN_EXAMPLES
+ train/utils.DatasetConfig.split = 'train'
+ train/utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS
+ train/utils.DatasetConfig.use_cached = %USE_CACHED_TASKS
+
+ # Parameters for train_eval/utils.DatasetConfig:
+ # ==============================================================================
+ train_eval/utils.DatasetConfig.batch_size = %BATCH_SIZE
+ train_eval/utils.DatasetConfig.mixture_or_task_name = %MIXTURE_OR_TASK_NAME
+ train_eval/utils.DatasetConfig.module = %MIXTURE_OR_TASK_MODULE
+ train_eval/utils.DatasetConfig.pack = True
+ train_eval/utils.DatasetConfig.seed = 42
+ train_eval/utils.DatasetConfig.shuffle = False
+ train_eval/utils.DatasetConfig.split = 'validation'
+ train_eval/utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS
+ train_eval/utils.DatasetConfig.use_cached = %USE_CACHED_TASKS
+
+ # Parameters for models.EncoderDecoderModel:
+ # ==============================================================================
+ models.EncoderDecoderModel.input_vocabulary = %VOCABULARY
+ models.EncoderDecoderModel.label_smoothing = %LABEL_SMOOTHING
+ models.EncoderDecoderModel.loss_normalizing_factor = %LOSS_NORMALIZING_FACTOR
+ models.EncoderDecoderModel.module = @network.Transformer()
+ models.EncoderDecoderModel.optimizer_def = %OPTIMIZER
+ models.EncoderDecoderModel.output_vocabulary = %VOCABULARY
+ models.EncoderDecoderModel.z_loss = %Z_LOSS
+
+ # Parameters for partitioning.PjitPartitioner:
+ # ==============================================================================
+ partitioning.PjitPartitioner.logical_axis_rules = \
+ @partitioning.standard_logical_axis_rules()
+ partitioning.PjitPartitioner.model_parallel_submesh = None
+ partitioning.PjitPartitioner.num_partitions = 1
+
+ # Parameters for utils.RestoreCheckpointConfig:
+ # ==============================================================================
+ utils.RestoreCheckpointConfig.path = []
+
+ # Parameters for utils.SaveCheckpointConfig:
+ # ==============================================================================
+ utils.SaveCheckpointConfig.dtype = 'float32'
+ utils.SaveCheckpointConfig.keep = 4
+ utils.SaveCheckpointConfig.period = 50000
+ utils.SaveCheckpointConfig.save_dataset = False
+ utils.SaveCheckpointConfig.use_gda = False
+
+ # Parameters for seqio.SentencePieceVocabulary:
+ # ==============================================================================
+ seqio.SentencePieceVocabulary.sentencepiece_model_file = \
+ 'gs://t5-dutch-english/vocabs/nedd.32000.100extra/spiece.model'
+
+ # Parameters for network.T5Config:
+ # ==============================================================================
+ network.T5Config.dropout_rate = %DROPOUT_RATE
+ network.T5Config.dtype = 'bfloat16'
+ network.T5Config.emb_dim = 768
+ network.T5Config.head_dim = 64
+ network.T5Config.logits_via_embedding = False
+ network.T5Config.mlp_activations = ('gelu', 'linear')
+ network.T5Config.mlp_dim = 3072
+ network.T5Config.num_decoder_layers = 36
+ network.T5Config.num_encoder_layers = 36
+ network.T5Config.num_heads = 12
+ network.T5Config.vocab_size = 32128
+
+ # Parameters for train_script.train:
+ # ==============================================================================
+ train_script.train.checkpoint_cfg = @utils.CheckpointConfig()
+ train_script.train.eval_period = 2000
+ train_script.train.eval_steps = 20
+ train_script.train.infer_eval_dataset_cfg = None
+ train_script.train.model = %MODEL
+ train_script.train.model_dir = %MODEL_DIR
+ train_script.train.partitioner = @partitioning.PjitPartitioner()
+ train_script.train.random_seed = %RANDOM_SEED
+ train_script.train.stats_period = 100
+ train_script.train.summarize_config_fn = @gin_utils.summarize_gin_config
+ train_script.train.total_steps = %TRAIN_STEPS
+ train_script.train.train_dataset_cfg = @train/utils.DatasetConfig()
+ train_script.train.train_eval_dataset_cfg = @train_eval/utils.DatasetConfig()
+ train_script.train.trainer_cls = @trainer.Trainer
+ train_script.train.use_hardware_rng = %USE_HARDWARE_RNG
+
+ # Parameters for trainer.Trainer:
+ # ==============================================================================
+ trainer.Trainer.learning_rate_fn = @utils.create_learning_rate_scheduler()
+ trainer.Trainer.num_microbatches = None
+
+ # Parameters for network.Transformer:
+ # ==============================================================================
+ network.Transformer.config = @network.T5Config()
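Two details worth flagging when reading this gin file against the config.json added below: the 'constant * rsqrt_decay' factors give the usual T5 schedule lr(step) = 1.0 / sqrt(max(step, 10000)), and DROPOUT_RATE is 0.0 for pretraining while the exported config.json keeps the transformers default of 0.1. The network.T5Config block otherwise maps one-to-one onto a Hugging Face T5Config; a minimal sketch of that mapping (the Python call is mine, all values come from the two files in this commit):

from transformers import T5Config

# network.T5Config from config.gin, restated as the equivalent
# Hugging Face T5Config (compare with config.json below).
cfg = T5Config(
    vocab_size=32128,                # network.T5Config.vocab_size
    d_model=768,                     # emb_dim
    d_kv=64,                         # head_dim
    d_ff=3072,                       # mlp_dim
    num_layers=36,                   # num_encoder_layers
    num_decoder_layers=36,           # num_decoder_layers
    num_heads=12,                    # num_heads
    feed_forward_proj="gated-gelu",  # mlp_activations = ('gelu', 'linear')
    tie_word_embeddings=False,       # logits_via_embedding = False
)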
config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "hf/t5_1_1-base-nl36-dutch",
+ "architectures": [
+ "T5ForConditionalGeneration"
+ ],
+ "d_ff": 3072,
+ "d_kv": 64,
+ "d_model": 768,
+ "decoder_start_token_id": 0,
+ "dense_act_fn": "gelu_new",
+ "dropout_rate": 0.1,
+ "eos_token_id": 1,
+ "feed_forward_proj": "gated-gelu",
+ "initializer_factor": 1.0,
+ "is_encoder_decoder": true,
+ "is_gated_act": true,
+ "layer_norm_epsilon": 1e-06,
+ "model_type": "t5",
+ "num_decoder_layers": 36,
+ "num_heads": 12,
+ "num_layers": 36,
+ "output_past": true,
+ "pad_token_id": 0,
+ "relative_attention_max_distance": 128,
+ "relative_attention_num_buckets": 32,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.24.0",
+ "use_cache": true,
+ "vocab_size": 32128
+ }
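With config.json in place the checkpoint loads through the standard transformers API. A minimal sketch; the Hub id below is hypothetical, substitute the repository this commit actually lives in:

from transformers import T5ForConditionalGeneration, T5Tokenizer

repo_id = "yhavinga/t5_1_1-base-nl36-dutch"  # hypothetical id -- use the real repo path

tokenizer = T5Tokenizer.from_pretrained(repo_id)
model = T5ForConditionalGeneration.from_pretrained(repo_id)  # reads pytorch_model.bin below
# Or load the Flax weights (flax_model.msgpack) into PyTorch instead:
# model = T5ForConditionalGeneration.from_pretrained(repo_id, from_flax=True)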
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca1339806e388332eb5ce9036e9bd11f934f0605ea493a3c5347bc3c83b0def3
+ size 1677466902
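The weights are stored through Git LFS, so the diff above is only a pointer file: per the git-lfs spec, oid is the SHA-256 of the actual payload. A small sketch (the local file name is an assumption) for checking a downloaded copy against that digest:

import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file in 1 MiB chunks and return its hex SHA-256 digest."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

# oid from the pointer above; the same check applies to the other LFS files below.
assert sha256_of("flax_model.msgpack") == (
    "ca1339806e388332eb5ce9036e9bd11f934f0605ea493a3c5347bc3c83b0def3"
)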
model-info.txt ADDED
The diff for this file is too large to render. See raw diff
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:15c1631d525672979aa4e35f3fdc4af5780c56f8ffad2aa42ffb9190330ee3f8
+ size 3255881749
special_tokens_map.json ADDED
@@ -0,0 +1,107 @@
+ {
+ "additional_special_tokens": [
+ "<extra_id_0>",
+ "<extra_id_1>",
+ "<extra_id_2>",
+ "<extra_id_3>",
+ "<extra_id_4>",
+ "<extra_id_5>",
+ "<extra_id_6>",
+ "<extra_id_7>",
+ "<extra_id_8>",
+ "<extra_id_9>",
+ "<extra_id_10>",
+ "<extra_id_11>",
+ "<extra_id_12>",
+ "<extra_id_13>",
+ "<extra_id_14>",
+ "<extra_id_15>",
+ "<extra_id_16>",
+ "<extra_id_17>",
+ "<extra_id_18>",
+ "<extra_id_19>",
+ "<extra_id_20>",
+ "<extra_id_21>",
+ "<extra_id_22>",
+ "<extra_id_23>",
+ "<extra_id_24>",
+ "<extra_id_25>",
+ "<extra_id_26>",
+ "<extra_id_27>",
+ "<extra_id_28>",
+ "<extra_id_29>",
+ "<extra_id_30>",
+ "<extra_id_31>",
+ "<extra_id_32>",
+ "<extra_id_33>",
+ "<extra_id_34>",
+ "<extra_id_35>",
+ "<extra_id_36>",
+ "<extra_id_37>",
+ "<extra_id_38>",
+ "<extra_id_39>",
+ "<extra_id_40>",
+ "<extra_id_41>",
+ "<extra_id_42>",
+ "<extra_id_43>",
+ "<extra_id_44>",
+ "<extra_id_45>",
+ "<extra_id_46>",
+ "<extra_id_47>",
+ "<extra_id_48>",
+ "<extra_id_49>",
+ "<extra_id_50>",
+ "<extra_id_51>",
+ "<extra_id_52>",
+ "<extra_id_53>",
+ "<extra_id_54>",
+ "<extra_id_55>",
+ "<extra_id_56>",
+ "<extra_id_57>",
+ "<extra_id_58>",
+ "<extra_id_59>",
+ "<extra_id_60>",
+ "<extra_id_61>",
+ "<extra_id_62>",
+ "<extra_id_63>",
+ "<extra_id_64>",
+ "<extra_id_65>",
+ "<extra_id_66>",
+ "<extra_id_67>",
+ "<extra_id_68>",
+ "<extra_id_69>",
+ "<extra_id_70>",
+ "<extra_id_71>",
+ "<extra_id_72>",
+ "<extra_id_73>",
+ "<extra_id_74>",
+ "<extra_id_75>",
+ "<extra_id_76>",
+ "<extra_id_77>",
+ "<extra_id_78>",
+ "<extra_id_79>",
+ "<extra_id_80>",
+ "<extra_id_81>",
+ "<extra_id_82>",
+ "<extra_id_83>",
+ "<extra_id_84>",
+ "<extra_id_85>",
+ "<extra_id_86>",
+ "<extra_id_87>",
+ "<extra_id_88>",
+ "<extra_id_89>",
+ "<extra_id_90>",
+ "<extra_id_91>",
+ "<extra_id_92>",
+ "<extra_id_93>",
+ "<extra_id_94>",
+ "<extra_id_95>",
+ "<extra_id_96>",
+ "<extra_id_97>",
+ "<extra_id_98>",
+ "<extra_id_99>"
+ ],
+ "eos_token": "</s>",
+ "pad_token": "<pad>",
+ "unk_token": "<unk>"
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:caa6e2f21aeec181276ab80273e3f869ce303ccb8602d68e0524783c3581092d
+ size 800223
spiece.vocab ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1,113 @@
+ {
+ "additional_special_tokens": [
+ "<extra_id_0>",
+ "<extra_id_1>",
+ "<extra_id_2>",
+ "<extra_id_3>",
+ "<extra_id_4>",
+ "<extra_id_5>",
+ "<extra_id_6>",
+ "<extra_id_7>",
+ "<extra_id_8>",
+ "<extra_id_9>",
+ "<extra_id_10>",
+ "<extra_id_11>",
+ "<extra_id_12>",
+ "<extra_id_13>",
+ "<extra_id_14>",
+ "<extra_id_15>",
+ "<extra_id_16>",
+ "<extra_id_17>",
+ "<extra_id_18>",
+ "<extra_id_19>",
+ "<extra_id_20>",
+ "<extra_id_21>",
+ "<extra_id_22>",
+ "<extra_id_23>",
+ "<extra_id_24>",
+ "<extra_id_25>",
+ "<extra_id_26>",
+ "<extra_id_27>",
+ "<extra_id_28>",
+ "<extra_id_29>",
+ "<extra_id_30>",
+ "<extra_id_31>",
+ "<extra_id_32>",
+ "<extra_id_33>",
+ "<extra_id_34>",
+ "<extra_id_35>",
+ "<extra_id_36>",
+ "<extra_id_37>",
+ "<extra_id_38>",
+ "<extra_id_39>",
+ "<extra_id_40>",
+ "<extra_id_41>",
+ "<extra_id_42>",
+ "<extra_id_43>",
+ "<extra_id_44>",
+ "<extra_id_45>",
+ "<extra_id_46>",
+ "<extra_id_47>",
+ "<extra_id_48>",
+ "<extra_id_49>",
+ "<extra_id_50>",
+ "<extra_id_51>",
+ "<extra_id_52>",
+ "<extra_id_53>",
+ "<extra_id_54>",
+ "<extra_id_55>",
+ "<extra_id_56>",
+ "<extra_id_57>",
+ "<extra_id_58>",
+ "<extra_id_59>",
+ "<extra_id_60>",
+ "<extra_id_61>",
+ "<extra_id_62>",
+ "<extra_id_63>",
+ "<extra_id_64>",
+ "<extra_id_65>",
+ "<extra_id_66>",
+ "<extra_id_67>",
+ "<extra_id_68>",
+ "<extra_id_69>",
+ "<extra_id_70>",
+ "<extra_id_71>",
+ "<extra_id_72>",
+ "<extra_id_73>",
+ "<extra_id_74>",
+ "<extra_id_75>",
+ "<extra_id_76>",
+ "<extra_id_77>",
+ "<extra_id_78>",
+ "<extra_id_79>",
+ "<extra_id_80>",
+ "<extra_id_81>",
+ "<extra_id_82>",
+ "<extra_id_83>",
+ "<extra_id_84>",
+ "<extra_id_85>",
+ "<extra_id_86>",
+ "<extra_id_87>",
+ "<extra_id_88>",
+ "<extra_id_89>",
+ "<extra_id_90>",
+ "<extra_id_91>",
+ "<extra_id_92>",
+ "<extra_id_93>",
+ "<extra_id_94>",
+ "<extra_id_95>",
+ "<extra_id_96>",
+ "<extra_id_97>",
+ "<extra_id_98>",
+ "<extra_id_99>"
+ ],
+ "eos_token": "</s>",
+ "extra_ids": 100,
+ "name_or_path": "yhavinga/ul2-base-en-nl",
+ "pad_token": "<pad>",
+ "sp_model_kwargs": {},
+ "special_tokens_map_file": null,
+ "tokenizer_class": "T5Tokenizer",
+ "unk_token": "<unk>",
+ "use_fast_tokenizer": false
+ }
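The 100 extra_ids declared above are T5's span-corruption sentinels: the i-th masked span in a pretraining input is replaced by <extra_id_i>, and the target spells out the masked spans in the same order. A minimal sketch of how they surface through the tokenizer (the local path is an assumption; it should hold this commit's spiece.model and tokenizer configs):

from transformers import T5Tokenizer

# Hypothetical local directory containing spiece.model, tokenizer_config.json
# and special_tokens_map.json from this commit.
tok = T5Tokenizer.from_pretrained("./tokenizer_dir")

# Each sentinel maps to a single id in the vocabulary.
ids = tok("Het <extra_id_0> zit op de <extra_id_1>.").input_ids
print(tok.convert_ids_to_tokens(ids))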
train/events.out.tfevents.1670764801.t1v-n-82e561a1-w-0.1168530.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3227e48b338fa4e4361512b196dc44f0f72dab6eda1baae5114670c7feb9f5cb
+ size 7948873
train/events.out.tfevents.1671135229.t1v-n-82e561a1-w-0.1383256.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9617b7c1ccc9cfd1b9525c23a57546f40f549249f405592f487aa4f61c71a92a
+ size 274265
train/events.out.tfevents.1671186857.t1v-n-7fd9879f-w-0.50950.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2ec26657ab86b74d0b8d77541ab44ccb6219625edde502711195a2a792b41e87
+ size 11923175
training_eval/mc4_nedd_wiki_news_mix_1/events.out.tfevents.1670764801.t1v-n-82e561a1-w-0.1168530.2.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06c17479056cba2bc1a97dc5a65ee9cfb75f84e67bf0aed4fa08871d36261f99
+ size 351757
training_eval/mc4_nedd_wiki_news_mix_1/events.out.tfevents.1671135229.t1v-n-82e561a1-w-0.1383256.2.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efc3e139b311b16c96db3591316053ffdfec7b904656823c59dc0089e0cbf99b
+ size 10680
training_eval/mc4_nedd_wiki_news_mix_1/events.out.tfevents.1671186857.t1v-n-7fd9879f-w-0.50950.2.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:620c5fb96a01bdfb5df0833908fb7dc5ed132e2290bc75e28669fb1ec2f134bc
+ size 527826
training_eval/mc4_nl_span_corruption/events.out.tfevents.1670764801.t1v-n-82e561a1-w-0.1168530.1.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a72fb5a79ba2e902beb1263bad445a1fa0715a0ba61a48725017030e3376114
+ size 351757
training_eval/mc4_nl_span_corruption/events.out.tfevents.1671135229.t1v-n-82e561a1-w-0.1383256.1.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e0986b4fba5b5f4e592af08fc61c904572ec415c4f74caf9e8897581b168c29
+ size 10680
training_eval/mc4_nl_span_corruption/events.out.tfevents.1671186857.t1v-n-7fd9879f-w-0.50950.1.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89a2c296e18254dbef6016c3a3844571073c40acbb554e0a194012a215dda2a9
+ size 527826