pere committed on
Commit
bcc84fc
1 Parent(s): 14706c3
README.md ADDED
@@ -0,0 +1,4 @@
+ ---
+ license: apache-2.0
+ ---
+ Evaluate Nynorsk translation
__pycache__/my_metrics.cpython-38.pyc ADDED
Binary file (455 Bytes).
 
__pycache__/tasks.cpython-38.pyc ADDED
Binary file (4.36 kB).
 
batch_nynorsk_NCC_base.sh ADDED
@@ -0,0 +1,64 @@
+ #!/bin/bash
+ PROJECT_DIR=${HOME}"/models/t5-nynorsk-evaluator"
+ export PYTHONPATH=${PROJECT_DIR}
+ #INITIAL_CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000\"
+ #TRAIN_STEPS=1505000
+
+
+ FINETUNE_STEPS=5000
+ MODEL_BUCKET_DIR="gs://nb-t5x-us-central2/finetuned/"
+ EVAL_PREFIX="ul2test/eval_nynorsk_"
+ CHECKPOINT_BUCKET_DIR="gs://nb-t5x-us-central2/"
+ CHECKPOINT_LIST=("exp1-t5-base-ul2-engvoc" "exp2-t5-base-ul2-scandvoc" "exp3-t5-base-span-engvoc" "exp4-t5-base-span-scandvoc" "exp5-t5-base-ul2-scandvoc-full" "exp6-t5-base-span-scandvoc-full" "exp7-t5-base-ul2-511-scandvoc" "exp8-t5-base-span-511-scandvoc" "exp9-t5-base-ul2-mt5voc" "exp10-t5-base-span-mt5voc" "exp11-t5-base-ul2-511-scandvoc-full" "exp12-t5-base-span-511-scandvoc-full" "exp13-t5-base-ul2-mt5voc-full" "exp14-t5-base-span-mt5voc-full" "exp15-t5-base-ul2-511-scandvoc-full-scratch" "exp16-t5-base-span-511-scandvoc-full-scratch" "exp17-t5-small-ul2-mt5voc-scratch" "exp18-t5-small-span-mt5voc-scratch" "exp19-t5-small-ul2-mt5voc" "exp20-t5-small-span-mt5voc" "exp21-t5-small-ul2-mt5voc-full" "exp22-t5-small-span-mt5voc-full")
+
+
+ PRETUNE_START_LIST=(100000 200000 300000 400000 500000 1000000 1100000 1184000 1200000 1204000 1284000 1300000 1384000 1400000 1484000 1500000)
+ VERSION_LIST=("1" "2" "3" "4" "5")
+
+ index=$(($1 + 0))
+
+ if [ $index -lt 1 ] || [ $index -gt ${#CHECKPOINT_LIST[@]} ] || [ $# -ne 1 ]; then
+     echo "Error: You need to provide the number of one of the checkpoints below as a parameter."
+     for i in "${!CHECKPOINT_LIST[@]}"; do
+         echo "$((i+1)). ${CHECKPOINT_LIST[i]}"
+     done
+
+     exit 1
+ fi
+
+
+
+ for v in "${VERSION_LIST[@]}"; do
+     for s in "${PRETUNE_START_LIST[@]}"; do
+         INITIAL_CHECKPOINT_PATH="${CHECKPOINT_BUCKET_DIR}${CHECKPOINT_LIST[$((index-1))]}/checkpoint_${s}"
+         TRAIN_STEPS=$((s+FINETUNE_STEPS))
+         if [[ "$INITIAL_CHECKPOINT_PATH" == *"engvoc"* ]]; then
+             GIN_FILE="finetune_translate_base.gin"
+             MIXTURE_OR_TASK_NAME="translate_long"
+
+         elif [[ "$INITIAL_CHECKPOINT_PATH" == *"scandvoc"* ]]; then
+             GIN_FILE="finetune_translate_base_scand.gin"
+             MIXTURE_OR_TASK_NAME="translate_long_scand"
+         else
+             if [[ "$INITIAL_CHECKPOINT_PATH" == *"small"* ]]; then
+                 GIN_FILE="finetune_translate_small_mt5.gin"
+             else
+                 GIN_FILE="finetune_translate_base_mt5.gin"
+             fi
+             MIXTURE_OR_TASK_NAME="translate_long_mt5"
+         fi
+
+         MODEL_DIR="${MODEL_BUCKET_DIR}${EVAL_PREFIX}v${v}_${CHECKPOINT_LIST[$((index-1))]}_${s}"
+         command="python3 ../../t5x/t5x/train.py --gin_search_paths=\"./\" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file=${GIN_FILE} --gin.INITIAL_CHECKPOINT_PATH=\\\"${INITIAL_CHECKPOINT_PATH}\\\" --gin.MIXTURE_OR_TASK_NAME=\\\"${MIXTURE_OR_TASK_NAME}\\\" --gin.MODEL_DIR=\\\"${MODEL_DIR}\\\""
+         echo "${command}"
+         # Run the command (comment out this line to only print it):
+         eval "${command}"
+     done
+ done
+
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v1\" &&
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v2\" &&
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v3\" &&
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v4\" &&
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v5\"
+
batch_various.sh ADDED
@@ -0,0 +1,75 @@
+ #!/bin/bash
+ PROJECT_DIR=${HOME}"/models/t5-nynorsk-norbench"
+ export PYTHONPATH=${PROJECT_DIR}
+
+ FINETUNE_STEPS=5000
+ EVAL_PREFIX="norbench/eval_translate_"
+ MODEL_BUCKET_DIR="gs://pere-north-t5x/finetuned/"
+
+ CHECKPOINT_LIST=(
+     "pere-north-t5x/pretrained_models/small/norwegian_NCC_plus_English_t5x_small/checkpoint_1500000"
+     "pere-north-t5x/pretrained_models/base/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000"
+     "pere-north-t5x/pretrained_models/large/norwegian_NCC_plus_English_t5x_large/checkpoint_1500000"
+     "t5-data/pretrained_models/t5x/mt5_small/checkpoint_1000000"
+     "t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
+     "t5-data/pretrained_models/t5x/mt5_large/checkpoint_1000000"
+ )
+
+ NAME_LIST=(
+     "north_t5_small_NCC"
+     "north_t5_base_NCC"
+     "north_t5_large_NCC"
+     "mt5_small"
+     "mt5_base"
+     "mt5_large"
+ )
+
+ TASK_LIST=("translate_long_mt5" "translate_long_mt5" "translate_long_mt5" "translate_long_mt5" "translate_long_mt5" "translate_long_mt5")
+
+ GIN_LIST=(
+     "finetune_translate_small_mt5.gin"
+     "finetune_translate_base_mt5.gin"
+     "finetune_translate_large_mt5.gin"
+     "finetune_translate_small_mt5.gin"
+     "finetune_translate_base_mt5.gin"
+     "finetune_translate_large_mt5.gin"
+ )
+
+ START_LIST=(1500000 1500000 1500000 1000000 1000000 1000000)  # pretraining steps; order matches CHECKPOINT_LIST above
+ EXP_LIST=(115 116 117 118 119 120)
+
+
+ VERSION_LIST=("1" "2" "3" "4" "5")
+
+ index=$(($1 + 0))
+
+ if [ $index -lt 1 ] || [ $index -gt ${#CHECKPOINT_LIST[@]} ] || [ $# -ne 1 ]; then
+     echo "Error: You need to provide the number of one of the checkpoints below as a parameter."
+     for i in "${!CHECKPOINT_LIST[@]}"; do
+         echo "$((i+1)). ${CHECKPOINT_LIST[i]}"
+     done
+
+     exit 1
+ fi
+
+ for v in "${VERSION_LIST[@]}"; do
+     i=$((index-1))
+     INITIAL_CHECKPOINT_PATH="gs://${CHECKPOINT_LIST[i]}"
+
+     TRAIN_STEPS=$((START_LIST[i]+FINETUNE_STEPS))
+     GIN_FILE=${GIN_LIST[i]}
+     MIXTURE_OR_TASK_NAME=${TASK_LIST[i]}
+     MODEL_DIR="${MODEL_BUCKET_DIR}${EVAL_PREFIX}exp${EXP_LIST[i]}_${NAME_LIST[i]}_v${v}"
+
+     command="python3 ../../t5x/t5x/train.py --gin_search_paths=\"./\" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file=${GIN_FILE} --gin.INITIAL_CHECKPOINT_PATH=\\\"${INITIAL_CHECKPOINT_PATH}\\\" --gin.MIXTURE_OR_TASK_NAME=\\\"${MIXTURE_OR_TASK_NAME}\\\" --gin.MODEL_DIR=\\\"${MODEL_DIR}\\\""
+     echo "${command}"
+     # Run the command (comment out this line to only print it):
+     eval "${command}"
+ done
+
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v1\" &&
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v2\" &&
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v3\" &&
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v4\" &&
+ #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v5\"
+
finetune_translate_base.gin ADDED
@@ -0,0 +1,33 @@
+ from __gin__ import dynamic_registration
+ import tasks
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/t5_1_1/base.gin"
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps of the checkpoint + fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+
+ # Make infer_eval use the same task feature lengths.
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ # Save a checkpoint every 1000 steps.
+ utils.SaveCheckpointConfig:
+   period = 1000
+   keep = 1  # number of checkpoints to keep
+
+ # Might have to be changed depending on the architecture.
+ # partitioning.PjitPartitioner.num_partitions = 1
+
+
finetune_translate_base_mt5.gin ADDED
@@ -0,0 +1,34 @@
+ from __gin__ import dynamic_registration
+ import tasks
+ import seqio
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include 't5x/examples/t5/mt5/base.gin'
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps of the checkpoint + fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+
+ # Make infer_eval use the same task feature lengths.
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ # Save a checkpoint every 1000 steps.
+ utils.SaveCheckpointConfig:
+   period = 1000
+   keep = 1  # number of checkpoints to keep
+
+ # Might have to be changed depending on the architecture.
+ # partitioning.PjitPartitioner.num_partitions = 1
+
+
finetune_translate_base_scand.gin ADDED
@@ -0,0 +1,37 @@
+ from __gin__ import dynamic_registration
+ import tasks
+ import seqio
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/t5_1_1/base.gin"
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps of the checkpoint + fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+
+ # Make infer_eval use the same task feature lengths.
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ # Save a checkpoint every 1000 steps.
+ utils.SaveCheckpointConfig:
+   period = 1000
+   keep = 1  # number of checkpoints to keep
+
+ # Might have to be changed depending on the architecture.
+ # partitioning.PjitPartitioner.num_partitions = 1
+
+ VOCABULARY = @seqio.SentencePieceVocabulary()
+ seqio.SentencePieceVocabulary.sentencepiece_model_file = "gs://nb-t5/t5/vocabs/wikipedia/no-da-en-sv-nn-is_32000_unigram.sp.model"
+ seqio.SentencePieceVocabulary.extra_ids = 100
+
finetune_translate_large.gin ADDED
@@ -0,0 +1,33 @@
+ from __gin__ import dynamic_registration
+ import tasks
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include "t5x/examples/t5/t5_1_1/large.gin"
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps of the checkpoint + fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+
+ # Make infer_eval use the same task feature lengths.
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ # Save a checkpoint every 1000 steps.
+ utils.SaveCheckpointConfig:
+   period = 1000
+   keep = 1  # number of checkpoints to keep
+
+ # Might have to be changed depending on the architecture.
+ # partitioning.PjitPartitioner.num_partitions = 1
+
+
finetune_translate_large_mt5.gin ADDED
@@ -0,0 +1,34 @@
+ from __gin__ import dynamic_registration
+ import tasks
+ import seqio
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include 't5x/examples/t5/mt5/large.gin'
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps of the checkpoint + fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+
+ # Make infer_eval use the same task feature lengths.
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ # Save a checkpoint every 1000 steps.
+ utils.SaveCheckpointConfig:
+   period = 1000
+   keep = 1  # number of checkpoints to keep
+
+ # Might have to be changed depending on the architecture.
+ # partitioning.PjitPartitioner.num_partitions = 1
+
+
finetune_translate_small_mt5.gin ADDED
@@ -0,0 +1,34 @@
+ from __gin__ import dynamic_registration
+ import tasks
+ import seqio
+
+ import __main__ as train_script
+ from t5.data import mixtures
+ from t5x import models
+ from t5x import partitioning
+ from t5x import utils
+
+ include 't5x/examples/t5/mt5/small.gin'
+ include "t5x/configs/runs/finetune.gin"
+
+ MIXTURE_OR_TASK_NAME = %gin.REQUIRED
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
+ TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps of the checkpoint + fine-tuning steps.
+ USE_CACHED_TASKS = False
+ DROPOUT_RATE = 0.1
+ RANDOM_SEED = 0
+
+ # Make infer_eval use the same task feature lengths.
+ infer_eval/utils.DatasetConfig:
+   task_feature_lengths = %TASK_FEATURE_LENGTHS
+
+ # Save a checkpoint every 1000 steps.
+ utils.SaveCheckpointConfig:
+   period = 1000
+   keep = 1  # number of checkpoints to keep
+
+ # Might have to be changed depending on the architecture.
+ # partitioning.PjitPartitioner.num_partitions = 1
+
+
gdrive ADDED
File without changes
generate_stats.py ADDED
@@ -0,0 +1,112 @@
+ from google.cloud import storage
+ import pandas as pd
+ import json
+ import re
+ import sys
+
+ # Create a storage client
+ client = storage.Client()
+
+ # Get the bucket
+ bucket_name = "nb-t5x-us-central2"
+ bucket = client.bucket(bucket_name)
+
+
+ #checkpoints=["exp1-t5-base-ul2-engvoc","exp2-t5-base-ul2-scandvoc","exp3-t5-base-span-engvoc","exp4-t5-base-span-scandvoc","exp5-t5-base-ul2-scandvoc-full","exp6-t5-base-span-scandvoc-full","exp7-t5-base-ul2-511-scandvoc","exp8-t5-base-span-511-scandvoc","exp9-t5-base-ul2-mt5voc","exp10-t5-base-span-mt5voc","exp11-t5-base-ul2-511-scandvoc-full","exp12-t5-base-span-511-scandvoc-full","exp13-t5-base-ul2-mt5voc-full","exp14-t5-base-span-mt5voc-full","exp14-t5-base-span-mt5voc-full","exp15-t5-base-ul2-511-scandvoc-full-scratch","exp16-t5-base-span-511-scandvoc-full-scratch","exp17-t5-small-ul2-mt5voc-scratch","exp18-t5-small-span-mt5voc-scratch","exp19-t5-small-ul2-mt5voc","exp20-t5-small-span-mt5voc","exp21-t5-small-ul2-mt5voc-full","exp22-t5-small-span-mt5voc-full"]
+
+ checkpoints=["exp1-t5-base-ul2-engvoc","exp2-t5-base-ul2-scandvoc","exp3-t5-base-span-engvoc","exp4-t5-base-span-scandvoc","exp5-t5-base-ul2-scandvoc-full","exp6-t5-base-span-scandvoc-full","exp7-t5-base-ul2-511-scandvoc","exp8-t5-base-span-511-scandvoc","exp9-t5-base-ul2-mt5voc","exp10-t5-base-span-mt5voc","exp11-t5-base-ul2-511-scandvoc-full","exp12-t5-base-span-511-scandvoc-full","exp13-t5-base-ul2-mt5voc-full","exp14-t5-base-span-mt5voc-full","exp15-t5-base-ul2-511-scandvoc-full-scratch","exp16-t5-base-span-511-scandvoc-full-scratch","exp17-t5-small-ul2-mt5voc-scratch","exp18-t5-small-span-mt5voc-scratch","exp19-t5-small-ul2-mt5voc","exp20-t5-small-span-mt5voc","exp21-t5-small-ul2-mt5voc-full","exp22-t5-small-span-mt5voc-full"]
+
+ start=["100000","200000","300000","400000","500000","1000000","1100000","1184000","1200000","1204000","1284000","1300000","1384000","1400000","1484000","1500000"]
+
+ iterations=["1","2","3","4","5"]
+ file_names=[]
+
+ for i in iterations:
+     for c in checkpoints:
+         for s in start:
+             if "scand" in c:
+                 name = f'finetuned/ul2test/eval_nynorsk_v{i}_{c}_{s}/inference_eval/translate_long_scand-metrics.jsonl'
+             elif "mt5" in c:
+                 name = f'finetuned/ul2test/eval_nynorsk_v{i}_{c}_{s}/inference_eval/translate_long_mt5-metrics.jsonl'
+             else:
+                 name = f'finetuned/ul2test/eval_nynorsk_v{i}_{c}_{s}/inference_eval/translate_long-metrics.jsonl'
+             file_names.append(name)
+
+
+ # list to store the parsed JSON data
+ file_contents = []
+
+ downloaded = 0
+ not_downloaded = 0
+
+ #print(file_names)
+ #print(bucket)
+ #sys.exit(-1)
+
+ # iterate over the files
+ for file_name in file_names:
+     # Get the file
+     blob = bucket.get_blob(file_name)
+     print(f'gs://{bucket_name}/{file_name}')
+
+     if not blob:
+         #print(f"Unable to download {file_name}")
+         not_downloaded += 1
+         continue
+     else:
+         downloaded += 1
+
+     content = blob.download_as_string().decode("utf-8")
+     # Split the content by newline
+     lines = content.split("\n")
+
+     # iterate over the lines
+     for n, line in enumerate(lines):
+         if not line:
+             continue
+         #print(line)
+         #print(file_name)
+         data = json.loads(line)
+         data['base_file_name'] = file_name
+         pretraining_steps = re.search(r"(voc_|voc-full_|voc-full-scratch_|voc-scratch_)(.*?)(?=/)", file_name).group(2)
+         data['pretraining_steps'] = int(pretraining_steps)
+         data['finetuning_steps'] = data['step'] - int(pretraining_steps)
+         data['vocab'] = re.search(r"-(\w+?)voc", file_name).group(1)
+         data['experiment'] = re.search(r"_exp(\w+?)-", file_name).group(1)
+         data['version'] = re.search(r"_v(\w+?)_exp", file_name).group(1)
+         data['experiment_name'] = re.search(r"exp\d+-(.*?)_", file_name).group(1)
+         file_contents.append(data)
+
+ print(f"\nIn total {downloaded} files downloaded, {not_downloaded} files not downloaded")
+
+ df = pd.json_normalize(file_contents)
+ only_5000 = df[df["finetuning_steps"] == 5000]
+ grouped = only_5000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro", "bleu"]].groupby(["experiment","experiment_name","pretraining_steps"])
+ average_at_5000 = grouped.mean().reset_index()
+ average_at_5000 = average_at_5000.assign(num_experiments=grouped.size().values)
+
+ only_3000 = df[df["finetuning_steps"] == 3000]
+ grouped = only_3000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro", "bleu"]].groupby(["experiment","experiment_name","pretraining_steps"])
+ average_at_3000 = grouped.mean().reset_index()
+ average_at_3000 = average_at_3000.assign(rows_count=grouped.size().values)
+
+ #print(average_at_3000.to_string(index=False))
+ print(average_at_5000.to_string(index=False))
+
+ print("\nNot complete:")
+ incomplete = average_at_5000[average_at_5000['num_experiments'] != 5]
+ print(incomplete)
+
+
+ df.to_json("stats/all.jsonl", orient="records", lines=True)
+ df.to_csv("stats/all.csv", index=False)
+
+ only_5000.to_json("stats/only_5000.jsonl", orient="records", lines=True)
+ only_5000.to_csv("stats/only_5000.csv", index=False)
+
+ average_at_5000.to_json("stats/average_at_5000.jsonl", orient="records", lines=True)
+ average_at_5000.to_csv("stats/average_at_5000.csv", index=False)
+
+
+ print("Files exported to stats")
+
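For reference, a small sketch of what the regular expressions in generate_stats.py pull out of a metrics path. The path below is a made-up example that follows the naming scheme the script itself builds above:

```python
import re

# Hypothetical metrics path following the pattern built in generate_stats.py.
file_name = "finetuned/ul2test/eval_nynorsk_v1_exp9-t5-base-ul2-mt5voc_1500000/inference_eval/translate_long_mt5-metrics.jsonl"

print(re.search(r"(voc_|voc-full_|voc-full-scratch_|voc-scratch_)(.*?)(?=/)", file_name).group(2))  # pretraining steps: 1500000
print(re.search(r"-(\w+?)voc", file_name).group(1))     # vocab: mt5
print(re.search(r"_exp(\w+?)-", file_name).group(1))    # experiment: 9
print(re.search(r"_v(\w+?)_exp", file_name).group(1))   # version: 1
print(re.search(r"exp\d+-(.*?)_", file_name).group(1))  # experiment name: t5-base-ul2-mt5voc
```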
generate_stats_various.py ADDED
@@ -0,0 +1,116 @@
+ from google.cloud import storage
+ import pandas as pd
+ import json
+ import re
+ import sys
+
+ # Create a storage client
+ client = storage.Client()
+
+ # Get the bucket
+ bucket_name = "nb-t5x-us-central2"
+ bucket = client.bucket(bucket_name)
+
+
+ checkpoints=["exp115_mt5_small","exp116_north_t5_base_NCC","exp117_north_t5_base_NCC_lm","exp118_north_t5_base_scand3M","exp119_mt5_base","exp120_sab_base_2","exp121_sab_base_3","exp122_sab_base_4"]
+
+
+ start=["1500000","1600000","2000000","3000000","4000000"]
+
+ iterations=["1","2","3","4","5"]
+ file_names=[]
+
+ for i in iterations:
+     for c in checkpoints:
+         for s in start:
+             if "scand" in c:
+                 name = f'finetuned/ul2test/eval_nynorsk_{c}_v{i}/inference_eval/translate_full_scand-metrics.jsonl'
+             elif ("mt5" in c) or ("north" in c):
+                 name = f'finetuned/ul2test/eval_nynorsk_{c}_v{i}/inference_eval/translate_full_mt5-metrics.jsonl'
+             else:
+                 name = f'finetuned/ul2test/eval_nynorsk_{c}_v{i}/inference_eval/translate_full-metrics.jsonl'
+             file_names.append(name)
+
+
+ # list to store the parsed JSON data
+ file_contents = []
+
+ downloaded = 0
+ not_downloaded = 0
+
+ #print(file_names)
+ #print(bucket)
+ #sys.exit(-1)
+
+ # iterate over the files
+ for file_name in file_names:
+     # Get the file
+     blob = bucket.get_blob(file_name)
+     print(f'gs://{bucket_name}/{file_name}')
+
+     if not blob:
+         #print(f"Unable to download {file_name}")
+         not_downloaded += 1
+         continue
+     else:
+         downloaded += 1
+
+     content = blob.download_as_string().decode("utf-8")
+
+     #print(file_name)
+     #print(content)
+
+     # Split the content by newline
+     lines = content.split("\n")
+
+     # iterate over the lines
+     for n, line in enumerate(lines):
+         if not line:
+             continue
+         #print(line)
+         #print(file_name)
+         data = json.loads(line)
+         data['base_file_name'] = file_name
+         pretraining_steps = 0  #re.search(r"(voc_|voc-full_|voc-full-scratch_|voc-scratch_)(.*?)(?=/)", file_name).group(2)
+         data['pretraining_steps'] = int(pretraining_steps)
+         data['finetuning_steps'] = int(str(data['step'])[-4:])
+         data['vocab'] = re.search(r"_(\w+?)-metric", file_name).group(1)
+         data['experiment'] = re.search(r"_exp(\w+?)_", file_name).group(1)
+         data['version'] = re.search(r"_v(\w+?)/", file_name).group(1)
+         data['experiment_name'] = re.search(r"exp\d+_(.*?)_v", file_name).group(1)
+         file_contents.append(data)
+
+ print(f"\nIn total {downloaded} files downloaded, {not_downloaded} files not downloaded")
+
+ df = pd.json_normalize(file_contents)
+ df = df.drop_duplicates(subset=['step','experiment','version']).reset_index()
+ only_5000 = df[df["finetuning_steps"] == 5000]
+ grouped_at_5000 = only_5000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro"]].groupby(["experiment","experiment_name","pretraining_steps"])
+ average_at_5000 = grouped_at_5000.mean().reset_index()
+ average_at_5000 = average_at_5000.assign(num_experiments=grouped_at_5000.size().values)
+ only_3000 = df[df["finetuning_steps"] == 3000]
+ grouped = only_3000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro"]].groupby(["experiment","experiment_name","pretraining_steps"])
+ average_at_3000 = grouped.mean().reset_index()
+ average_at_3000 = average_at_3000.assign(rows_count=grouped.size().values)
+
+ #print(average_at_3000.to_string(index=False))
+ print(average_at_5000.to_string(index=False))
+
+ print("\nNot complete:")
+ incomplete = average_at_5000[average_at_5000['num_experiments'] != 5]
+ print(incomplete)
+
+ df.to_json("stats_various/all.jsonl", orient="records", lines=True)
+ df.to_csv("stats_various/all.csv", index=False)
+
+ only_5000.to_json("stats_various/only_5000.jsonl", orient="records", lines=True)
+ only_5000.to_csv("stats_various/only_5000.csv", index=False)
+
+ average_at_5000.to_json("stats_various/average_at_5000.jsonl", orient="records", lines=True)
+ average_at_5000.to_csv("stats_various/average_at_5000.csv", index=False)
+
+
+ print("Files exported to stats_various")
+
+
+
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import sklearn.metrics
2
+ import numpy as np
3
+
4
+ def f1_macro(targets, predictions):
5
+ targets, predictions = np.asarray(targets).astype(str), np.asarray(predictions).astype(str)
6
+ return {"f1_macro": 100*sklearn.metrics.f1_score(targets, predictions, average='macro')}
7
+
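A quick sanity check of the metric wrapper above; the labels are made up purely for illustration:

```python
from my_metrics import f1_macro

# Made-up string labels just to exercise the wrapper.
targets = ["positiv", "negativ", "positiv", "negativ"]
predictions = ["positiv", "positiv", "positiv", "negativ"]

print(f1_macro(targets, predictions))  # approximately {'f1_macro': 73.3}
```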
nb_nn_10000.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:13a5bc673760b418473cc6c7636746d531e6ae261879720ae2ab081e4c08c404
+ size 2063855
nb_nn_dev_10000.tsv ADDED
The diff for this file is too large to render.
 
nb_nn_test_10000.tsv ADDED
The diff for this file is too large to render.
 
nb_nn_train_10000.tsv ADDED
The diff for this file is too large to render.
 
tasks.py ADDED
@@ -0,0 +1,229 @@
+ # /home/perk/mymodel/categorisation-mt5x/tasks.py
+
+
+ import functools
+ import seqio
+ import my_metrics
+ import tensorflow_datasets as tfds
+ from t5.evaluation import metrics
+ from t5.data import preprocessors
+ #import my_preprocessors
+ import t5
+ import tensorflow.compat.v1 as tf
+
+
+
+ tsv_parliament_path = {
+     "train": "gs://notram-public/finetune_datasets/parliament_speeches_1998_2016_frp_or_sv/train.tsv",
+     "validation": "gs://notram-public/finetune_datasets/parliament_speeches_1998_2016_frp_or_sv/dev.tsv",
+     "test": "gs://notram-public/finetune_datasets/parliament_speeches_1998_2016_frp_or_sv/test.tsv"
+ }
+
+ tsv_translate_path = {
+     "train": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/train.tsv",
+     "validation": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/dev.tsv",
+     "test": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/test.tsv"
+ }
+
+ tsv_translate_long_path = {
+     "train": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/train_long.tsv",
+     "validation": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/dev.tsv",
+     "test": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/test.tsv"
+ }
+
+ tsv_sentiment_path = {
+     "train": "gs://notram-public/finetune_datasets/norec_sentiment/train.tsv",
+     "validation": "gs://notram-public/finetune_datasets/norec_sentiment/dev.tsv",
+     "test": "gs://notram-public/finetune_datasets/norec_sentiment/test.tsv"
+ }
+
+ json_angry_tweets_path = {
+     "train": "gs://notram-public/finetune_datasets/angry_tweets/train.jsonl",
+     "validation": "gs://notram-public/finetune_datasets/angry_tweets/test.jsonl",
+     "test": "gs://notram-public/finetune_datasets/angry_tweets/test.jsonl"
+ }
+
+ tsv_angry_tweets_path = {
+     "train": "gs://notram-public/finetune_datasets/angry_tweets/train.tsv",
+     "validation": "gs://notram-public/finetune_datasets/angry_tweets/test.tsv",
+     "test": "gs://notram-public/finetune_datasets/angry_tweets/test.tsv"
+ }
+
+
+ tsv_dane_path = {
+     "train": "gs://notram-public/finetune_datasets/dane/train.tsv",
+     "validation": "gs://notram-public/finetune_datasets/dane/test.tsv",
+     "test": "gs://notram-public/finetune_datasets/dane/test.tsv"
+ }
+
+ tsv_dane_tokens_path = {
+     "train": "gs://notram-public/finetune_datasets/dane/train_tokens.tsv",
+     "validation": "gs://notram-public/finetune_datasets/dane/test_tokens.tsv",
+     "test": "gs://notram-public/finetune_datasets/dane/test_tokens.tsv"
+ }
+
+
+ tsv_dane_long_tokens_path = {
+     "train": "gs://notram-public/finetune_datasets/dane/train_long_tokens.tsv",
+     "validation": "gs://notram-public/finetune_datasets/dane/test_long_tokens.tsv",
+     "test": "gs://notram-public/finetune_datasets/dane/test_long_tokens.tsv"
+ }
+
+
+ #vocabulary = seqio.SentencePieceVocabulary(
+ #    'gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)
+ scand_vocabulary=seqio.SentencePieceVocabulary('gs://nb-t5/t5/vocabs/wikipedia/no-da-en-sv-nn-is_32000_unigram.sp.model', extra_ids=100)
+ eng_vocabulary=seqio.SentencePieceVocabulary('gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model', extra_ids=0)
+ mt5_vocabulary=seqio.SentencePieceVocabulary('gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)
+
+ DEFAULT_OUTPUT_FEATURES = {
+     "inputs": seqio.Feature(
+         vocabulary=eng_vocabulary, add_eos=True,
+         required=False),
+     "targets": seqio.Feature(
+         vocabulary=eng_vocabulary, add_eos=True)
+ }
+
+
+ SCAND_OUTPUT_FEATURES = {
+     "inputs": seqio.Feature(
+         vocabulary=scand_vocabulary, add_eos=True,
+         required=False),
+     "targets": seqio.Feature(
+         vocabulary=scand_vocabulary, add_eos=True)
+ }
+
+ MT5_OUTPUT_FEATURES = {
+     "inputs": seqio.Feature(
+         vocabulary=mt5_vocabulary, add_eos=True,
+         required=False),
+     "targets": seqio.Feature(
+         vocabulary=mt5_vocabulary, add_eos=True)
+ }
+
+
+
+ def categorise_preprocessor(ds):
+     def normalize_text(text):
+         """Normalization hook; quote stripping is disabled, so the text is returned unchanged."""
+         #text = tf.strings.regex_replace(text,"'(.*)'", r"\1")
+         ...
+         return text
+
+     def to_inputs_and_targets(ex):
+         """Map {"source": ..., "target": ...} -> {"inputs": ..., "targets": ...}."""
+         return {
+             "inputs":
+                 tf.strings.join(
+                     [normalize_text(ex["source"])]),
+             "targets":
+                 tf.strings.join(
+                     [normalize_text(ex["target"])]),
+         }
+     return ds.map(to_inputs_and_targets,
+                   num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+
+ seqio.TaskRegistry.add(
+     "parliament",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_parliament_path,
+         #num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["target","source"]),
+         categorise_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     metric_fns=[metrics.accuracy,my_metrics.f1_macro],
+     output_features=DEFAULT_OUTPUT_FEATURES,
+ )
+
+ seqio.TaskRegistry.add(
+     "sentiment",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_sentiment_path,
+         #num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["target","source"]),
+         categorise_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     metric_fns=[metrics.accuracy,my_metrics.f1_macro],
+     output_features=DEFAULT_OUTPUT_FEATURES,
+ )
+
+
+ seqio.TaskRegistry.add(
+     "translate",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_translate_path,
+         #num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["source","target"]),
+         categorise_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     metric_fns=[metrics.accuracy,my_metrics.f1_macro,metrics.bleu],
+     output_features=DEFAULT_OUTPUT_FEATURES,
+ )
+
+ seqio.TaskRegistry.add(
+     "translate_long_mt5",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_translate_long_path,
+         #num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["source","target"]),
+         categorise_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     metric_fns=[metrics.accuracy,my_metrics.f1_macro,metrics.bleu],
+     output_features=MT5_OUTPUT_FEATURES,
+ )
+
+ seqio.TaskRegistry.add(
+     "translate_long_scand",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_translate_long_path,
+         #num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["source","target"]),
+         categorise_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     metric_fns=[metrics.accuracy,my_metrics.f1_macro,metrics.bleu],
+     output_features=SCAND_OUTPUT_FEATURES,
+ )
+
+ seqio.TaskRegistry.add(
+     "translate_long",
+     source=seqio.TextLineDataSource(
+         split_to_filepattern=tsv_translate_long_path,
+         #num_input_examples=num_nq_examples
+     ),
+     preprocessors=[
+         functools.partial(
+             t5.data.preprocessors.parse_tsv,
+             field_names=["source","target"]),
+         categorise_preprocessor,
+         seqio.preprocessors.tokenize_and_append_eos,
+     ],
+     metric_fns=[metrics.accuracy,my_metrics.f1_macro,metrics.bleu],
+     output_features=DEFAULT_OUTPUT_FEATURES,
+ )
+
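With tasks.py importable (the batch scripts export PYTHONPATH for exactly this), the registered tasks can be loaded through the standard seqio API. A rough sketch, assuming the GCS paths above are readable from the current environment:

```python
import seqio
import tasks  # registers "translate", "translate_long", "translate_long_scand", "translate_long_mt5", ...

task = seqio.get_mixture_or_task("translate_long_mt5")
ds = task.get_dataset(
    sequence_length={"inputs": 512, "targets": 512},
    split="validation",
    shuffle=False,
)
for example in ds.take(1):
    print({k: v.numpy() for k, v in example.items()})
```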