pere committed on
Commit
e3afbe6
1 Parent(s): 43ba9a0
Files changed (3) hide show
  1. norwegian_lm_base.gin +21 -0
  2. tasks.py +27 -0
  3. train_norwegian_lm_base.sh +9 -0
norwegian_lm_base.gin ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include 't5x/examples/t5/mt5/base.gin'
2
+ include 'pretrain_cont.gin'
3
+ #include 't5x/configs/runs/pretrain.gin'
4
+ #include 't5x/configs/runs/finetune.gin'
5
+
6
+
7
+ # Register necessary SeqIO Tasks/Mixtures.
8
+ import t5.data.mixtures
9
+ import tasks
10
+
11
+ MIXTURE_OR_TASK_NAME = "ncc_english_prefix_lm_stream"
12
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
13
+ TRAIN_STEPS = 1_700_000
14
+ DROPOUT_RATE = 0.0 # Changed from the default since T5-1.1 recommends this.
15
+ INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000"
16
+ PjitPartitioner.num_partitions = 1
17
+ utils.SaveCheckpointConfig.period = 5000
18
+ utils.SaveCheckpointConfig.keep = 3
19
+
20
+
21
+
tasks.py CHANGED
@@ -156,6 +156,33 @@ TaskRegistry.add(
156
  metric_fns=[]
157
  )
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  # Final pretraining task used in Raffel et al., 2019 adapted to NCC
160
  dataset_name = 'NbAiLab/scandinavian'
161
  dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
 
156
  metric_fns=[]
157
  )
158
 
159
+ # Final pretraining task used in Raffel et al., 2019 adapted to NCC
160
+ dataset_name = 'NbAiLab/NCC_plus_english'
161
+ dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
162
+ dataset_shapes = None
163
+ TaskRegistry.add(
164
+ "ncc_english_prefix_lm_stream",
165
+ source=seqio.FunctionDataSource(
166
+ dataset_fn=functools.partial(dataset_fn, dataset_params=dataset_params),
167
+ splits=("train", "validation"),
168
+ caching_permitted=False,
169
+ num_input_examples=dataset_shapes,
170
+ ),
171
+ preprocessors=[
172
+ functools.partial(
173
+ target_to_key, key_map={
174
+ "inputs": None,
175
+ "targets": None,
176
+ }, target_key="targets"),
177
+ seqio.preprocessors.tokenize,
178
+ # seqio.CacheDatasetPlaceholder(),
179
+ preprocessors.prefix_lm,
180
+ seqio.preprocessors.append_eos_after_trim,
181
+ ],
182
+ output_features={"targets": DEFAULT_OUTPUT_FEATURES["targets"]},
183
+ metric_fns=[]
184
+ )
185
+
186
  # Final pretraining task used in Raffel et al., 2019 adapted to NCC
187
  dataset_name = 'NbAiLab/scandinavian'
188
  dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
train_norwegian_lm_base.sh ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ PROJECT_DIR=${HOME}"/models/pk-nb-t5x"
2
+ T5X_DIR="../../t5x" # directory where the t5x is cloned.
3
+ MODEL_DIR="gs://nb-t5x-us-central2/norwegian_NCC_plus_English_pluss200k_lm_t5x_base"
4
+ export PYTHONPATH=${PROJECT_DIR}
5
+
6
+ python3 ${T5X_DIR}/t5x/train.py \
7
+ --gin_search_paths=${PROJECT_DIR} \
8
+ --gin_file="norwegian_lm_base.gin" \
9
+ --gin.MODEL_DIR="'${MODEL_DIR}'" \