pere committed on
Commit
deb4aba
1 Parent(s): c73c577
Files changed (2) hide show
  1. batch_nynorsk_balanced_small_long.sh +12 -0
  2. tasks.py +23 -0
batch_nynorsk_balanced_small_long.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PROJECT_DIR=${HOME}"/models/t5-nynorsk-oversetter"
2
+ export PYTHONPATH=${PROJECT_DIR}
3
+ INITIAL_CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/norwegian_NCC_plus_English_pluss200k_balanced_bokmaal_nynorsk_t5x_small/checkpoint_1700000\"
4
+ TRAIN_STEPS=1705000
5
+
6
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_small.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_small_long_v1\"
7
+
8
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_small.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_small_long_v2\" &&
9
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_small.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_small_long_v3\" &&
10
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_small.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_small_long_v4\" &&
11
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_small.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_small_long_v5\"
12
+
tasks.py CHANGED
@@ -25,6 +25,12 @@ tsv_translate_path = {
25
  "test": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/test.tsv"
26
  }
27
 
 
 
 
 
 
 
28
  tsv_sentiment_path = {
29
  "train": "gs://notram-public/finetune_datasets/norec_sentiment/train.tsv",
30
  "validation": "gs://notram-public/finetune_datasets/norec_sentiment/dev.tsv",
@@ -182,3 +188,20 @@ seqio.TaskRegistry.add(
182
  output_features=DEFAULT_OUTPUT_FEATURES,
183
  )
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  "test": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/test.tsv"
26
  }
27
 
28
+ tsv_translate_long_path = {
29
+ "train": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/train_long.tsv",
30
+ "validation": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/dev_long.tsv",
31
+ "test": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/test_long.tsv"
32
+ }
33
+
34
  tsv_sentiment_path = {
35
  "train": "gs://notram-public/finetune_datasets/norec_sentiment/train.tsv",
36
  "validation": "gs://notram-public/finetune_datasets/norec_sentiment/dev.tsv",
 
188
  output_features=DEFAULT_OUTPUT_FEATURES,
189
  )
190
 
191
+ seqio.TaskRegistry.add(
192
+ "translate_long",
193
+ source=seqio.TextLineDataSource(
194
+ split_to_filepattern=tsv_translate_long_path,
195
+ #num_input_examples=num_nq_examples
196
+ ),
197
+ preprocessors=[
198
+ functools.partial(
199
+ t5.data.preprocessors.parse_tsv,
200
+ field_names=["source","target"]),
201
+ categorise_preprocessor,
202
+ seqio.preprocessors.tokenize_and_append_eos,
203
+ ],
204
+ metric_fns=[metrics.accuracy,my_metrics.f1_macro,metrics.bleu],
205
+ output_features=DEFAULT_OUTPUT_FEATURES,
206
+ )
207
+