lm
Browse files- norwegian_lm_base.gin +21 -0
- tasks.py +27 -0
- train_norwegian_lm_base.sh +9 -0
norwegian_lm_base.gin
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gin config for continuing pretraining of the Norwegian NCC + English mT5-base
# model with a prefix-LM objective, starting from checkpoint 1_500_000.
include 't5x/examples/t5/mt5/base.gin'
include 'pretrain_cont.gin'
#include 't5x/configs/runs/pretrain.gin'
#include 't5x/configs/runs/finetune.gin'

# Register necessary SeqIO Tasks/Mixtures.
import t5.data.mixtures
import tasks

MIXTURE_OR_TASK_NAME = "ncc_english_prefix_lm_stream"
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
TRAIN_STEPS = 1_700_000
DROPOUT_RATE = 0.0  # Changed from the default since T5-1.1 recommends this.
INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000"
PjitPartitioner.num_partitions = 1
utils.SaveCheckpointConfig.period = 5000
utils.SaveCheckpointConfig.keep = 3
tasks.py
CHANGED
@@ -156,6 +156,33 @@ TaskRegistry.add(
|
|
156 |
metric_fns=[]
|
157 |
)
|
158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
# Final pretraining task used in Raffel et al., 2019 adapted to NCC
|
160 |
dataset_name = 'NbAiLab/scandinavian'
|
161 |
dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
|
|
|
156 |
metric_fns=[]
|
157 |
)
|
158 |
|
# Prefix-LM pretraining task on the NCC + English corpus, streamed from the
# Hugging Face Hub. Adapted from the final pretraining task used in
# Raffel et al., 2019 to NCC.
dataset_name = 'NbAiLab/NCC_plus_english'
dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
# No precomputed split sizes: the streamed dataset is not enumerated up front.
dataset_shapes = None
TaskRegistry.add(
    "ncc_english_prefix_lm_stream",
    source=seqio.FunctionDataSource(
        dataset_fn=functools.partial(dataset_fn, dataset_params=dataset_params),
        splits=("train", "validation"),
        # Streaming source: caching would require materializing the dataset.
        caching_permitted=False,
        num_input_examples=dataset_shapes,
    ),
    preprocessors=[
        # Map the raw example onto a single "targets" field; prefix_lm later
        # derives inputs/targets from it.
        functools.partial(
            target_to_key, key_map={
                "inputs": None,
                "targets": None,
            }, target_key="targets"),
        seqio.preprocessors.tokenize,
        # seqio.CacheDatasetPlaceholder(),
        preprocessors.prefix_lm,
        seqio.preprocessors.append_eos_after_trim,
    ],
    output_features={"targets": DEFAULT_OUTPUT_FEATURES["targets"]},
    metric_fns=[]
)

186 |
# Final pretraining task used in Raffel et al., 2019 adapted to NCC
|
187 |
dataset_name = 'NbAiLab/scandinavian'
|
188 |
dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
|
train_norwegian_lm_base.sh
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# Launch continued pretraining of the Norwegian NCC+English base LM with t5x,
# using the gin config from this repository. Fixes vs. original: shebang added,
# all expansions quoted (paths may contain spaces), and the dangling trailing
# backslash at end-of-file removed (it continued the command into nothing).

PROJECT_DIR="${HOME}/models/pk-nb-t5x"
T5X_DIR="../../t5x"  # directory where the t5x is cloned.
MODEL_DIR="gs://nb-t5x-us-central2/norwegian_NCC_plus_English_pluss200k_lm_t5x_base"
export PYTHONPATH="${PROJECT_DIR}"

python3 "${T5X_DIR}/t5x/train.py" \
  --gin_search_paths="${PROJECT_DIR}" \
  --gin_file="norwegian_lm_base.gin" \
  --gin.MODEL_DIR="'${MODEL_DIR}'"