testing
- README.md +4 -0
- __pycache__/my_metrics.cpython-38.pyc +0 -0
- __pycache__/tasks.cpython-38.pyc +0 -0
- batch_nynorsk_NCC_base.sh +64 -0
- batch_various.sh +75 -0
- finetune_translate_base.gin +33 -0
- finetune_translate_base_mt5.gin +34 -0
- finetune_translate_base_scand.gin +37 -0
- finetune_translate_large.gin +33 -0
- finetune_translate_large_mt5.gin +34 -0
- finetune_translate_small_mt5.gin +34 -0
- gdrive +0 -0
- generate_stats.py +112 -0
- generate_stats_various.py +116 -0
- my_metrics.py +7 -0
- nb_nn_10000.zip +3 -0
- nb_nn_dev_10000.tsv +0 -0
- nb_nn_test_10000.tsv +0 -0
- nb_nn_train_10000.tsv +0 -0
- tasks.py +229 -0
README.md
ADDED
---
license: apache-2.0
---
Evaluate Nynorsk translation
__pycache__/my_metrics.cpython-38.pyc
ADDED
Binary file (455 Bytes).

__pycache__/tasks.cpython-38.pyc
ADDED
Binary file (4.36 kB).
batch_nynorsk_NCC_base.sh
ADDED
#!/bin/bash
PROJECT_DIR=${HOME}"/models/t5-nynorsk-evaluator"
export PYTHONPATH=${PROJECT_DIR}
#INITIAL_CHECKPOINT_PATH="gs://nb-t5x-us-central2/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000"
#TRAIN_STEPS=1505000

FINETUNE_STEPS=5000
MODEL_BUCKET_DIR="gs://nb-t5x-us-central2/finetuned/"
EVAL_PREFIX="ul2test/eval_nynorsk_"
CHECKPOINT_BUCKET_DIR="gs://nb-t5x-us-central2/"
CHECKPOINT_LIST=("exp1-t5-base-ul2-engvoc" "exp2-t5-base-ul2-scandvoc" "exp3-t5-base-span-engvoc" "exp4-t5-base-span-scandvoc" "exp5-t5-base-ul2-scandvoc-full" "exp6-t5-base-span-scandvoc-full" "exp7-t5-base-ul2-511-scandvoc" "exp8-t5-base-span-511-scandvoc" "exp9-t5-base-ul2-mt5voc" "exp10-t5-base-span-mt5voc" "exp11-t5-base-ul2-511-scandvoc-full" "exp12-t5-base-span-511-scandvoc-full" "exp13-t5-base-ul2-mt5voc-full" "exp14-t5-base-span-mt5voc-full" "exp15-t5-base-ul2-511-scandvoc-full-scratch" "exp16-t5-base-span-511-scandvoc-full-scratch" "exp17-t5-small-ul2-mt5voc-scratch" "exp18-t5-small-span-mt5voc-scratch" "exp19-t5-small-ul2-mt5voc" "exp20-t5-small-span-mt5voc" "exp21-t5-small-ul2-mt5voc-full" "exp22-t5-small-span-mt5voc-full")

PRETUNE_START_LIST=(100000 200000 300000 400000 500000 1000000 1100000 1184000 1200000 1204000 1284000 1300000 1384000 1400000 1484000 1500000)
VERSION_LIST=("1" "2" "3" "4" "5")

index=$(($1 + 0))

if [ $index -lt 1 ] || [ $index -gt ${#CHECKPOINT_LIST[@]} ] || [ $# -ne 1 ]; then
    echo "Error: You need to provide the number of one of the checkpoints below as a parameter."
    for i in "${!CHECKPOINT_LIST[@]}"; do
        echo "$((i+1)). ${CHECKPOINT_LIST[i]}"
    done
    exit 1
fi

for v in "${VERSION_LIST[@]}"; do
    for s in "${PRETUNE_START_LIST[@]}"; do
        INITIAL_CHECKPOINT_PATH="${CHECKPOINT_BUCKET_DIR}${CHECKPOINT_LIST[$((index-1))]}/checkpoint_${s}"
        TRAIN_STEPS=$((s+FINETUNE_STEPS))
        # Pick the gin config and seqio task that match the checkpoint's vocabulary.
        if [[ "$INITIAL_CHECKPOINT_PATH" == *"engvoc"* ]]; then
            GIN_FILE="finetune_translate_base.gin"
            MIXTURE_OR_TASK_NAME="translate_long"
        elif [[ "$INITIAL_CHECKPOINT_PATH" == *"scandvoc"* ]]; then
            GIN_FILE="finetune_translate_base_scand.gin"
            MIXTURE_OR_TASK_NAME="translate_long_scand"
        else
            if [[ "$INITIAL_CHECKPOINT_PATH" == *"small"* ]]; then
                GIN_FILE="finetune_translate_small_mt5.gin"
            else
                GIN_FILE="finetune_translate_base_mt5.gin"
            fi
            MIXTURE_OR_TASK_NAME="translate_long_mt5"
        fi

        MODEL_DIR="${MODEL_BUCKET_DIR}${EVAL_PREFIX}v${v}_${CHECKPOINT_LIST[$((index-1))]}_${s}"
        command="python3 ../../t5x/t5x/train.py --gin_search_paths=\"./\" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file=${GIN_FILE} --gin.INITIAL_CHECKPOINT_PATH=\\\"${INITIAL_CHECKPOINT_PATH}\\\" --gin.MIXTURE_OR_TASK_NAME=\\\"${MIXTURE_OR_TASK_NAME}\\\" --gin.MODEL_DIR=\\\"${MODEL_DIR}\\\""
        echo "${command}"
        # Run the assembled command.
        eval "${command}"
    done
done

#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v1\" &&
#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v2\" &&
#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v3\" &&
#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v4\" &&
#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v5\"
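The vocabulary branching in the script above is what ties each checkpoint to its gin config and seqio task. As a sanity check, here is a minimal Python mirror of that selection logic (a hypothetical helper for illustration only, not part of the repository):

# Hypothetical mirror of the script's config selection, for illustration only.
def select_config(checkpoint_path):
    """Return (gin_file, task_name) for a checkpoint, keyed on its vocabulary."""
    if "engvoc" in checkpoint_path:
        return "finetune_translate_base.gin", "translate_long"
    if "scandvoc" in checkpoint_path:
        return "finetune_translate_base_scand.gin", "translate_long_scand"
    # Remaining checkpoints use the mT5 vocabulary; match the config to the model size.
    if "small" in checkpoint_path:
        return "finetune_translate_small_mt5.gin", "translate_long_mt5"
    return "finetune_translate_base_mt5.gin", "translate_long_mt5"

assert select_config("gs://nb-t5x-us-central2/exp2-t5-base-ul2-scandvoc/checkpoint_100000") == (
    "finetune_translate_base_scand.gin", "translate_long_scand")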
batch_various.sh
ADDED
#!/bin/bash
PROJECT_DIR=${HOME}"/models/t5-nynorsk-norbench"
export PYTHONPATH=${PROJECT_DIR}

FINETUNE_STEPS=5000
EVAL_PREFIX="norbench/eval_translate_"
MODEL_BUCKET_DIR="gs://pere-north-t5x/finetuned/"

CHECKPOINT_LIST=(
    "pere-north-t5x/pretrained_models/small/norwegian_NCC_plus_English_t5x_small/checkpoint_1500000"
    "pere-north-t5x/pretrained_models/base/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000"
    "pere-north-t5x/pretrained_models/large/norwegian_NCC_plus_English_t5x_large/checkpoint_1500000"
    "t5-data/pretrained_models/t5x/mt5_small/checkpoint_1000000"
    "t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
    "t5-data/pretrained_models/t5x/mt5_large/checkpoint_1000000"
)

NAME_LIST=(
    "north_t5_small_NCC"
    "north_t5_base_NCC"
    "north_t5_large_NCC"
    "mt5_small"
    "mt5_base"
    "mt5_large"
)

TASK_LIST=("translate_long_mt5" "translate_long_mt5" "translate_long_mt5" "translate_long_mt5" "translate_long_mt5" "translate_long_mt5")

GIN_LIST=(
    "finetune_translate_small_mt5.gin"
    "finetune_translate_base_mt5.gin"
    "finetune_translate_large_mt5.gin"
    "finetune_translate_small_mt5.gin"
    "finetune_translate_base_mt5.gin"
    "finetune_translate_large_mt5.gin"
)

START_LIST=(1000000 1000000 1000000 1500000 1500000 1500000)
EXP_LIST=(115 116 117 118 119 120)


VERSION_LIST=("1" "2" "3" "4" "5")

index=$(($1 + 0))

if [ $index -lt 1 ] || [ $index -gt ${#CHECKPOINT_LIST[@]} ] || [ $# -ne 1 ]; then
    echo "Error: You need to provide the number of one of the checkpoints below as a parameter."
    for i in "${!CHECKPOINT_LIST[@]}"; do
        echo "$((i+1)). ${CHECKPOINT_LIST[i]}"
    done
    exit 1
fi

for v in "${VERSION_LIST[@]}"; do
    i=$((index-1))  # Arithmetic assignment; the original `i=($index-1)` created a one-element array.
    INITIAL_CHECKPOINT_PATH="gs://${CHECKPOINT_LIST[i]}"

    TRAIN_STEPS=$((START_LIST[i]+FINETUNE_STEPS))
    GIN_FILE=${GIN_LIST[i]}
    MIXTURE_OR_TASK_NAME=${TASK_LIST[i]}
    MODEL_DIR="${MODEL_BUCKET_DIR}${EVAL_PREFIX}exp${EXP_LIST[i]}_${NAME_LIST[i]}_v${v}"

    command="python3 ../../t5x/t5x/train.py --gin_search_paths=\"./\" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file=${GIN_FILE} --gin.INITIAL_CHECKPOINT_PATH=\\\"${INITIAL_CHECKPOINT_PATH}\\\" --gin.MIXTURE_OR_TASK_NAME=\\\"${MIXTURE_OR_TASK_NAME}\\\" --gin.MODEL_DIR=\\\"${MODEL_DIR}\\\""
    echo "${command}"
    # Run the assembled command.
    eval "${command}"
done

#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v1\" &&
#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v2\" &&
#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v3\" &&
#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v4\" &&
#python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin.TRAIN_STEPS=${TRAIN_STEPS} --gin_file="finetune_translate_base.gin" --gin.INITIAL_CHECKPOINT_PATH=${INITIAL_CHECKPOINT_PATH} --gin.MIXTURE_OR_TASK_NAME=\"translate_long\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v5\"
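The parallel arrays above describe one run configuration per model, addressed by a shared index. A small Python sketch of what row i resolves to, with values copied by hand from the arrays (illustration only):

# Illustration: row i of the parallel arrays in batch_various.sh (0-based index).
names  = ["north_t5_small_NCC", "north_t5_base_NCC", "north_t5_large_NCC",
          "mt5_small", "mt5_base", "mt5_large"]
gins   = ["finetune_translate_small_mt5.gin", "finetune_translate_base_mt5.gin",
          "finetune_translate_large_mt5.gin"] * 2
starts = [1000000, 1000000, 1000000, 1500000, 1500000, 1500000]
exps   = [115, 116, 117, 118, 119, 120]

FINETUNE_STEPS = 5000
i = 1  # corresponds to invoking `./batch_various.sh 2`
print(names[i], gins[i], starts[i] + FINETUNE_STEPS, f"exp{exps[i]}")
# north_t5_base_NCC finetune_translate_base_mt5.gin 1005000 exp116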
finetune_translate_base.gin
ADDED
from __gin__ import dynamic_registration
import tasks

import __main__ as train_script
from t5.data import mixtures
from t5x import models
from t5x import partitioning
from t5x import utils

include "t5x/examples/t5/t5_1_1/base.gin"
include "t5x/configs/runs/finetune.gin"

MIXTURE_OR_TASK_NAME = %gin.REQUIRED
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps plus fine-tuning steps; set by the launcher scripts.
USE_CACHED_TASKS = False
DROPOUT_RATE = 0.1
RANDOM_SEED = 0

# Fix a small error: give the inference-eval dataset the same feature lengths.
infer_eval/utils.DatasetConfig:
  task_feature_lengths = %TASK_FEATURE_LENGTHS

# Save a checkpoint every 1000 steps.
utils.SaveCheckpointConfig:
  period = 1000
  keep = 1  # Number of checkpoints to keep.

# Might have to be changed based on the architecture.
# partitioning.PjitPartitioner.num_partitions = 1
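The %gin.REQUIRED placeholders are filled at launch time by the --gin.NAME=VALUE flags the batch scripts assemble. The same bindings can be expressed directly with gin-config; a sketch only, assuming t5x is installed so the include lines resolve, with example values taken from the batch script:

import gin

# Sketch: bind the required values the same way the --gin.* flags do.
gin.parse_config_files_and_bindings(
    config_files=["finetune_translate_base.gin"],
    bindings=[
        "TRAIN_STEPS = 105000",
        'INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/exp1-t5-base-ul2-engvoc/checkpoint_100000"',
        'MIXTURE_OR_TASK_NAME = "translate_long"',
        'MODEL_DIR = "gs://nb-t5x-us-central2/finetuned/ul2test/eval_nynorsk_v1_exp1-t5-base-ul2-engvoc_100000"',
    ],
)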
finetune_translate_base_mt5.gin
ADDED
from __gin__ import dynamic_registration
import tasks
import seqio

import __main__ as train_script
from t5.data import mixtures
from t5x import models
from t5x import partitioning
from t5x import utils

include "t5x/examples/t5/mt5/base.gin"
include "t5x/configs/runs/finetune.gin"

MIXTURE_OR_TASK_NAME = %gin.REQUIRED
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps plus fine-tuning steps; set by the launcher scripts.
USE_CACHED_TASKS = False
DROPOUT_RATE = 0.1
RANDOM_SEED = 0

# Fix a small error: give the inference-eval dataset the same feature lengths.
infer_eval/utils.DatasetConfig:
  task_feature_lengths = %TASK_FEATURE_LENGTHS

# Save a checkpoint every 1000 steps.
utils.SaveCheckpointConfig:
  period = 1000
  keep = 1  # Number of checkpoints to keep.

# Might have to be changed based on the architecture.
# partitioning.PjitPartitioner.num_partitions = 1
finetune_translate_base_scand.gin
ADDED
from __gin__ import dynamic_registration
import tasks
import seqio

import __main__ as train_script
from t5.data import mixtures
from t5x import models
from t5x import partitioning
from t5x import utils

include "t5x/examples/t5/t5_1_1/base.gin"
include "t5x/configs/runs/finetune.gin"

MIXTURE_OR_TASK_NAME = %gin.REQUIRED
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps plus fine-tuning steps; set by the launcher scripts.
USE_CACHED_TASKS = False
DROPOUT_RATE = 0.1
RANDOM_SEED = 0

# Fix a small error: give the inference-eval dataset the same feature lengths.
infer_eval/utils.DatasetConfig:
  task_feature_lengths = %TASK_FEATURE_LENGTHS

# Save a checkpoint every 1000 steps.
utils.SaveCheckpointConfig:
  period = 1000
  keep = 1  # Number of checkpoints to keep.

# Might have to be changed based on the architecture.
# partitioning.PjitPartitioner.num_partitions = 1

# Override the default vocabulary with the 32k Scandinavian SentencePiece model.
VOCABULARY = @seqio.SentencePieceVocabulary()
seqio.SentencePieceVocabulary.sentencepiece_model_file = "gs://nb-t5/t5/vocabs/wikipedia/no-da-en-sv-nn-is_32000_unigram.sp.model"
seqio.SentencePieceVocabulary.extra_ids = 100
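The last three lines of the scand config replace the default T5 vocabulary with the 32k Scandinavian SentencePiece model. A small sketch of the object that binding constructs, using the standard seqio API (loading it requires read access to the gs://nb-t5 bucket):

import seqio

# Same vocabulary the gin binding above builds: 32k SentencePiece pieces
# plus 100 extra (sentinel) IDs used by the span-corruption objective.
vocab = seqio.SentencePieceVocabulary(
    "gs://nb-t5/t5/vocabs/wikipedia/no-da-en-sv-nn-is_32000_unigram.sp.model",
    extra_ids=100,
)
print(vocab.vocab_size)  # expected 32100 (32000 pieces + 100 extra IDs)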
finetune_translate_large.gin
ADDED
from __gin__ import dynamic_registration
import tasks

import __main__ as train_script
from t5.data import mixtures
from t5x import models
from t5x import partitioning
from t5x import utils

include "t5x/examples/t5/t5_1_1/large.gin"
include "t5x/configs/runs/finetune.gin"

MIXTURE_OR_TASK_NAME = %gin.REQUIRED
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps plus fine-tuning steps; set by the launcher scripts.
USE_CACHED_TASKS = False
DROPOUT_RATE = 0.1
RANDOM_SEED = 0

# Fix a small error: give the inference-eval dataset the same feature lengths.
infer_eval/utils.DatasetConfig:
  task_feature_lengths = %TASK_FEATURE_LENGTHS

# Save a checkpoint every 1000 steps.
utils.SaveCheckpointConfig:
  period = 1000
  keep = 1  # Number of checkpoints to keep.

# Might have to be changed based on the architecture.
# partitioning.PjitPartitioner.num_partitions = 1
finetune_translate_large_mt5.gin
ADDED
from __gin__ import dynamic_registration
import tasks
import seqio

import __main__ as train_script
from t5.data import mixtures
from t5x import models
from t5x import partitioning
from t5x import utils

include "t5x/examples/t5/mt5/large.gin"
include "t5x/configs/runs/finetune.gin"

MIXTURE_OR_TASK_NAME = %gin.REQUIRED
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps plus fine-tuning steps; set by the launcher scripts.
USE_CACHED_TASKS = False
DROPOUT_RATE = 0.1
RANDOM_SEED = 0

# Fix a small error: give the inference-eval dataset the same feature lengths.
infer_eval/utils.DatasetConfig:
  task_feature_lengths = %TASK_FEATURE_LENGTHS

# Save a checkpoint every 1000 steps.
utils.SaveCheckpointConfig:
  period = 1000
  keep = 1  # Number of checkpoints to keep.

# Might have to be changed based on the architecture.
# partitioning.PjitPartitioner.num_partitions = 1
finetune_translate_small_mt5.gin
ADDED
from __gin__ import dynamic_registration
import tasks
import seqio

import __main__ as train_script
from t5.data import mixtures
from t5x import models
from t5x import partitioning
from t5x import utils

include "t5x/examples/t5/mt5/small.gin"
include "t5x/configs/runs/finetune.gin"

MIXTURE_OR_TASK_NAME = %gin.REQUIRED
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
INITIAL_CHECKPOINT_PATH = %gin.REQUIRED
TRAIN_STEPS = %gin.REQUIRED  # Pre-training steps plus fine-tuning steps; set by the launcher scripts.
USE_CACHED_TASKS = False
DROPOUT_RATE = 0.1
RANDOM_SEED = 0

# Fix a small error: give the inference-eval dataset the same feature lengths.
infer_eval/utils.DatasetConfig:
  task_feature_lengths = %TASK_FEATURE_LENGTHS

# Save a checkpoint every 1000 steps.
utils.SaveCheckpointConfig:
  period = 1000
  keep = 1  # Number of checkpoints to keep.

# Might have to be changed based on the architecture.
# partitioning.PjitPartitioner.num_partitions = 1
gdrive
ADDED
File without changes
generate_stats.py
ADDED
from google.cloud import storage
import pandas as pd
import json
import re
import sys

# Create a storage client
client = storage.Client()

# Get the bucket
bucket_name = "nb-t5x-us-central2"
bucket = client.bucket(bucket_name)


#checkpoints=["exp1-t5-base-ul2-engvoc","exp2-t5-base-ul2-scandvoc","exp3-t5-base-span-engvoc","exp4-t5-base-span-scandvoc","exp5-t5-base-ul2-scandvoc-full","exp6-t5-base-span-scandvoc-full","exp7-t5-base-ul2-511-scandvoc","exp8-t5-base-span-511-scandvoc","exp9-t5-base-ul2-mt5voc","exp10-t5-base-span-mt5voc","exp11-t5-base-ul2-511-scandvoc-full","exp12-t5-base-span-511-scandvoc-full","exp13-t5-base-ul2-mt5voc-full","exp14-t5-base-span-mt5voc-full","exp14-t5-base-span-mt5voc-full","exp15-t5-base-ul2-511-scandvoc-full-scratch","exp16-t5-base-span-511-scandvoc-full-scratch","exp17-t5-small-ul2-mt5voc-scratch","exp18-t5-small-span-mt5voc-scratch","exp19-t5-small-ul2-mt5voc","exp20-t5-small-span-mt5voc","exp21-t5-small-ul2-mt5voc-full","exp22-t5-small-span-mt5voc-full"]

checkpoints=["exp1-t5-base-ul2-engvoc","exp2-t5-base-ul2-scandvoc","exp3-t5-base-span-engvoc","exp4-t5-base-span-scandvoc","exp5-t5-base-ul2-scandvoc-full","exp6-t5-base-span-scandvoc-full","exp7-t5-base-ul2-511-scandvoc","exp8-t5-base-span-511-scandvoc","exp9-t5-base-ul2-mt5voc","exp10-t5-base-span-mt5voc","exp11-t5-base-ul2-511-scandvoc-full","exp12-t5-base-span-511-scandvoc-full","exp13-t5-base-ul2-mt5voc-full","exp14-t5-base-span-mt5voc-full","exp15-t5-base-ul2-511-scandvoc-full-scratch","exp16-t5-base-span-511-scandvoc-full-scratch","exp17-t5-small-ul2-mt5voc-scratch","exp18-t5-small-span-mt5voc-scratch","exp19-t5-small-ul2-mt5voc","exp20-t5-small-span-mt5voc","exp21-t5-small-ul2-mt5voc-full","exp22-t5-small-span-mt5voc-full"]

start=["100000","200000","300000","400000","500000","1000000","1100000","1184000","1200000","1204000","1284000","1300000","1384000","1400000","1484000","1500000"]

iterations=["1","2","3","4","5"]
file_names=[]

# Build the expected metrics-file path for every (iteration, checkpoint, start step) combination.
for i in iterations:
    for c in checkpoints:
        for s in start:
            if "scand" in c:
                name = f'finetuned/ul2test/eval_nynorsk_v{i}_{c}_{s}/inference_eval/translate_long_scand-metrics.jsonl'
            elif "mt5" in c:
                name = f'finetuned/ul2test/eval_nynorsk_v{i}_{c}_{s}/inference_eval/translate_long_mt5-metrics.jsonl'
            else:
                name = f'finetuned/ul2test/eval_nynorsk_v{i}_{c}_{s}/inference_eval/translate_long-metrics.jsonl'
            file_names.append(name)


# List to store the JSON data from each file
file_contents = []

downloaded = 0
not_downloaded = 0

#print(file_names)
#print(bucket)
#sys.exit(-1)

# Iterate over the files
for file_name in file_names:
    # Get the file
    blob = bucket.get_blob(file_name)
    print(f'gs://{bucket_name}/{file_name}')

    if not blob:
        #print(f"Unable to download {file_name}")
        not_downloaded += 1
        continue
    downloaded += 1

    content = blob.download_as_string().decode("utf-8")
    # Split the content by newline
    lines = content.split("\n")

    # Iterate over the lines; each line is one JSON metrics record.
    for line in lines:
        if not line:
            continue
        #print(line)
        #print(file_name)
        data = json.loads(line)
        data['base_file_name'] = file_name
        pretraining_steps = re.search(r"(voc_|voc-full_|voc-full-scratch_|voc-scratch_)(.*?)(?=/)", file_name).group(2)
        data['pretraining_steps'] = int(pretraining_steps)
        data['finetuning_steps'] = data['step'] - int(pretraining_steps)
        data['vocab'] = re.search(r"-(\w+?)voc", file_name).group(1)
        data['experiment'] = re.search(r"_exp(\w+?)-", file_name).group(1)
        data['version'] = re.search(r"_v(\w+?)_exp", file_name).group(1)
        data['experiment_name'] = re.search(r"exp\d+-(.*?)_", file_name).group(1)
        file_contents.append(data)

print(f"\nIn total {downloaded} files were downloaded; {not_downloaded} files could not be downloaded")

df = pd.json_normalize(file_contents)
only_5000 = df[df["finetuning_steps"] == 5000]
grouped = only_5000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro", "bleu"]].groupby(["experiment","experiment_name","pretraining_steps"])
average_at_5000 = grouped.mean().reset_index()
average_at_5000 = average_at_5000.assign(num_experiments=grouped.size().values)

only_3000 = df[df["finetuning_steps"] == 3000]
grouped = only_3000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro", "bleu"]].groupby(["experiment","experiment_name","pretraining_steps"])
average_at_3000 = grouped.mean().reset_index()
average_at_3000 = average_at_3000.assign(rows_count=grouped.size().values)

#print(average_at_3000.to_string(index=False))
print(average_at_5000.to_string(index=False))

print("\nNot complete:")
incomplete = average_at_5000[average_at_5000['num_experiments'] != 5]
print(incomplete)


df.to_json("stats/all.jsonl", orient="records", lines=True)
df.to_csv("stats/all.csv", index=False)

only_5000.to_json("stats/only_5000.jsonl", orient="records", lines=True)
only_5000.to_csv("stats/only_5000.csv", index=False)

average_at_5000.to_json("stats/average_at_5000.jsonl", orient="records", lines=True)
average_at_5000.to_csv("stats/average_at_5000.csv", index=False)


print("Files exported to stats/")
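For reference, this is what those regular expressions extract from one concrete metrics path (a hand-checked illustration, not part of the script):

import re

file_name = ('finetuned/ul2test/eval_nynorsk_v1_exp2-t5-base-ul2-scandvoc_100000/'
             'inference_eval/translate_long_scand-metrics.jsonl')

# The pre-training step count sits between the vocabulary suffix and the next "/".
print(re.search(r"(voc_|voc-full_|voc-full-scratch_|voc-scratch_)(.*?)(?=/)", file_name).group(2))  # 100000
print(re.search(r"-(\w+?)voc", file_name).group(1))    # scand
print(re.search(r"_exp(\w+?)-", file_name).group(1))   # 2
print(re.search(r"_v(\w+?)_exp", file_name).group(1))  # 1
print(re.search(r"exp\d+-(.*?)_", file_name).group(1)) # t5-base-ul2-scandvoc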
generate_stats_various.py
ADDED
from google.cloud import storage
import pandas as pd
import json
import re
import sys

# Create a storage client
client = storage.Client()

# Get the bucket
bucket_name = "nb-t5x-us-central2"
bucket = client.bucket(bucket_name)


checkpoints=["exp115_mt5_small","exp116_north_t5_base_NCC","exp117_north_t5_base_NCC_lm","exp118_north_t5_base_scand3M","exp119_mt5_base","exp120_sab_base_2","exp121_sab_base_3","exp122_sab_base_4"]


start=["1500000","1600000","2000000","3000000","4000000"]

iterations=["1","2","3","4","5"]
file_names=[]

# Build the expected metrics-file paths. Note that the path does not depend on s,
# so each name is appended once per start value; the duplicates are dropped below.
for i in iterations:
    for c in checkpoints:
        for s in start:
            if "scand" in c:
                name = f'finetuned/ul2test/eval_nynorsk_{c}_v{i}/inference_eval/translate_full_scand-metrics.jsonl'
            elif ("mt5" in c) or ("north" in c):
                name = f'finetuned/ul2test/eval_nynorsk_{c}_v{i}/inference_eval/translate_full_mt5-metrics.jsonl'
            else:
                name = f'finetuned/ul2test/eval_nynorsk_{c}_v{i}/inference_eval/translate_full-metrics.jsonl'
            file_names.append(name)


# List to store the JSON data from each file
file_contents = []

downloaded = 0
not_downloaded = 0

#print(file_names)
#print(bucket)
#sys.exit(-1)

# Iterate over the files
for file_name in file_names:
    # Get the file
    blob = bucket.get_blob(file_name)
    print(f'gs://{bucket_name}/{file_name}')

    if not blob:
        #print(f"Unable to download {file_name}")
        not_downloaded += 1
        continue
    downloaded += 1

    content = blob.download_as_string().decode("utf-8")

    #print(file_name)
    #print(content)

    # Split the content by newline
    lines = content.split("\n")

    # Iterate over the lines; each line is one JSON metrics record.
    for line in lines:
        if not line:
            continue
        #print(line)
        #print(file_name)
        data = json.loads(line)
        data['base_file_name'] = file_name
        pretraining_steps = 0 #re.search(r"(voc_|voc-full_|voc-full-scratch_|voc-scratch_)(.*?)(?=/)", file_name).group(2)
        data['pretraining_steps'] = int(pretraining_steps)
        data['finetuning_steps'] = int(str(data['step'])[-4:])
        data['vocab'] = re.search(r"_(\w+?)-metric", file_name).group(1)
        data['experiment'] = re.search(r"_exp(\w+?)_", file_name).group(1)
        data['version'] = re.search(r"_v(\w+?)/", file_name).group(1)
        data['experiment_name'] = re.search(r"exp\d+_(.*?)_v", file_name).group(1)
        file_contents.append(data)

print(f"\nIn total {downloaded} files were downloaded; {not_downloaded} files could not be downloaded")

df = pd.json_normalize(file_contents)
# Drop the duplicate records produced by the redundant loop over start values.
df = df.drop_duplicates(subset=['step','experiment','version']).reset_index()
only_5000 = df[df["finetuning_steps"] == 5000]
grouped_at_5000 = only_5000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro"]].groupby(["experiment","experiment_name","pretraining_steps"])
average_at_5000 = grouped_at_5000.mean().reset_index()
average_at_5000 = average_at_5000.assign(num_experiments=grouped_at_5000.size().values)
only_3000 = df[df["finetuning_steps"] == 3000]
grouped = only_3000[["experiment_name","experiment","pretraining_steps", "accuracy", "f1_macro"]].groupby(["experiment","experiment_name","pretraining_steps"])
average_at_3000 = grouped.mean().reset_index()
average_at_3000 = average_at_3000.assign(rows_count=grouped.size().values)

#print(average_at_3000.to_string(index=False))
print(average_at_5000.to_string(index=False))

print("\nNot complete:")
incomplete = average_at_5000[average_at_5000['num_experiments'] != 5]
print(incomplete)

df.to_json("stats_various/all.jsonl", orient="records", lines=True)
df.to_csv("stats_various/all.csv", index=False)

only_5000.to_json("stats_various/only_5000.jsonl", orient="records", lines=True)
only_5000.to_csv("stats_various/only_5000.csv", index=False)

average_at_5000.to_json("stats_various/average_at_5000.jsonl", orient="records", lines=True)
average_at_5000.to_csv("stats_various/average_at_5000.csv", index=False)


print("Files exported to stats_various/")
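The fine-tuning step count above is recovered from the last four digits of the global step. That works here because every start step in the script ends in four zeros and FINETUNE_STEPS is at most 5000; a quick illustrative check:

# Illustrative check of the last-four-digits trick used above.
for start in [1500000, 1600000, 2000000, 3000000, 4000000]:
    for finetune in [1000, 2000, 3000, 4000, 5000]:
        step = start + finetune
        assert int(str(step)[-4:]) == finetune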
my_metrics.py
ADDED
import sklearn.metrics
import numpy as np

def f1_macro(targets, predictions):
    """Macro-averaged F1 in percent, returned as a seqio-style metric dict."""
    targets, predictions = np.asarray(targets).astype(str), np.asarray(predictions).astype(str)
    return {"f1_macro": 100*sklearn.metrics.f1_score(targets, predictions, average='macro')}
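A quick usage sketch with toy labels (not data from this repository):

from my_metrics import f1_macro

# Two classes; one "positive" example is mispredicted as "negative".
print(f1_macro(["positive", "negative", "positive"],
               ["positive", "negative", "negative"]))
# {'f1_macro': 66.66...}: both per-class F1 scores are 2/3, so the macro average is 2/3.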
nb_nn_10000.zip
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:13a5bc673760b418473cc6c7636746d531e6ae261879720ae2ab081e4c08c404
size 2063855
nb_nn_dev_10000.tsv
ADDED
The diff for this file is too large to render.

nb_nn_test_10000.tsv
ADDED
The diff for this file is too large to render.

nb_nn_train_10000.tsv
ADDED
The diff for this file is too large to render.
tasks.py
ADDED
# /home/perk/mymodel/categorisation-mt5x/tasks.py


import functools
import seqio
import my_metrics
import tensorflow_datasets as tfds
from t5.evaluation import metrics
from t5.data import preprocessors
#import my_preprocessors
import t5
import tensorflow.compat.v1 as tf



tsv_parliament_path = {
    "train": "gs://notram-public/finetune_datasets/parliament_speeches_1998_2016_frp_or_sv/train.tsv",
    "validation": "gs://notram-public/finetune_datasets/parliament_speeches_1998_2016_frp_or_sv/dev.tsv",
    "test": "gs://notram-public/finetune_datasets/parliament_speeches_1998_2016_frp_or_sv/test.tsv"
}

tsv_translate_path = {
    "train": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/train.tsv",
    "validation": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/dev.tsv",
    "test": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/test.tsv"
}

tsv_translate_long_path = {
    "train": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/train_long.tsv",
    "validation": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/dev.tsv",
    "test": "gs://nb-t5x-us-central2/corpus_bokmal_nynorsk/test.tsv"
}

tsv_sentiment_path = {
    "train": "gs://notram-public/finetune_datasets/norec_sentiment/train.tsv",
    "validation": "gs://notram-public/finetune_datasets/norec_sentiment/dev.tsv",
    "test": "gs://notram-public/finetune_datasets/norec_sentiment/test.tsv"
}

json_angry_tweets_path = {
    "train": "gs://notram-public/finetune_datasets/angry_tweets/train.jsonl",
    "validation": "gs://notram-public/finetune_datasets/angry_tweets/test.jsonl",
    "test": "gs://notram-public/finetune_datasets/angry_tweets/test.jsonl"
}

tsv_angry_tweets_path = {
    "train": "gs://notram-public/finetune_datasets/angry_tweets/train.tsv",
    "validation": "gs://notram-public/finetune_datasets/angry_tweets/test.tsv",
    "test": "gs://notram-public/finetune_datasets/angry_tweets/test.tsv"
}


tsv_dane_path = {
    "train": "gs://notram-public/finetune_datasets/dane/train.tsv",
    "validation": "gs://notram-public/finetune_datasets/dane/test.tsv",
    "test": "gs://notram-public/finetune_datasets/dane/test.tsv"
}

tsv_dane_tokens_path = {
    "train": "gs://notram-public/finetune_datasets/dane/train_tokens.tsv",
    "validation": "gs://notram-public/finetune_datasets/dane/test_tokens.tsv",
    "test": "gs://notram-public/finetune_datasets/dane/test_tokens.tsv"
}


tsv_dane_long_tokens_path = {
    "train": "gs://notram-public/finetune_datasets/dane/train_long_tokens.tsv",
    "validation": "gs://notram-public/finetune_datasets/dane/test_long_tokens.tsv",
    "test": "gs://notram-public/finetune_datasets/dane/test_long_tokens.tsv"
}


#vocabulary = seqio.SentencePieceVocabulary(
#    'gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)
scand_vocabulary = seqio.SentencePieceVocabulary('gs://nb-t5/t5/vocabs/wikipedia/no-da-en-sv-nn-is_32000_unigram.sp.model', extra_ids=100)
eng_vocabulary = seqio.SentencePieceVocabulary('gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model', extra_ids=0)
mt5_vocabulary = seqio.SentencePieceVocabulary('gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)

DEFAULT_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(
        vocabulary=eng_vocabulary, add_eos=True,
        required=False),
    "targets": seqio.Feature(
        vocabulary=eng_vocabulary, add_eos=True)
}


SCAND_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(
        vocabulary=scand_vocabulary, add_eos=True,
        required=False),
    "targets": seqio.Feature(
        vocabulary=scand_vocabulary, add_eos=True)
}

MT5_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(
        vocabulary=mt5_vocabulary, add_eos=True,
        required=False),
    "targets": seqio.Feature(
        vocabulary=mt5_vocabulary, add_eos=True)
}



def categorise_preprocessor(ds):
  def normalize_text(text):
    """Currently a pass-through; the quote-stripping step below is disabled."""
    #text = tf.strings.regex_replace(text,"'(.*)'", r"\1")
    return text

  def to_inputs_and_targets(ex):
    """Map {"source": ..., "target": ...} to {"inputs": ..., "targets": ...}."""
    return {
        "inputs":
            tf.strings.join(
                [normalize_text(ex["source"])]),
        "targets":
            tf.strings.join(
                [normalize_text(ex["target"])]),
    }
  return ds.map(to_inputs_and_targets,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)


seqio.TaskRegistry.add(
    "parliament",
    source=seqio.TextLineDataSource(
        split_to_filepattern=tsv_parliament_path,
        #num_input_examples=num_nq_examples
    ),
    preprocessors=[
        functools.partial(
            t5.data.preprocessors.parse_tsv,
            field_names=["target", "source"]),
        categorise_preprocessor,
        seqio.preprocessors.tokenize_and_append_eos,
    ],
    metric_fns=[metrics.accuracy, my_metrics.f1_macro],
    output_features=DEFAULT_OUTPUT_FEATURES,
)

seqio.TaskRegistry.add(
    "sentiment",
    source=seqio.TextLineDataSource(
        split_to_filepattern=tsv_sentiment_path,
        #num_input_examples=num_nq_examples
    ),
    preprocessors=[
        functools.partial(
            t5.data.preprocessors.parse_tsv,
            field_names=["target", "source"]),
        categorise_preprocessor,
        seqio.preprocessors.tokenize_and_append_eos,
    ],
    metric_fns=[metrics.accuracy, my_metrics.f1_macro],
    output_features=DEFAULT_OUTPUT_FEATURES,
)


seqio.TaskRegistry.add(
    "translate",
    source=seqio.TextLineDataSource(
        split_to_filepattern=tsv_translate_path,
        #num_input_examples=num_nq_examples
    ),
    preprocessors=[
        functools.partial(
            t5.data.preprocessors.parse_tsv,
            field_names=["source", "target"]),
        categorise_preprocessor,
        seqio.preprocessors.tokenize_and_append_eos,
    ],
    metric_fns=[metrics.accuracy, my_metrics.f1_macro, metrics.bleu],
    output_features=DEFAULT_OUTPUT_FEATURES,
)

seqio.TaskRegistry.add(
    "translate_long_mt5",
    source=seqio.TextLineDataSource(
        split_to_filepattern=tsv_translate_long_path,
        #num_input_examples=num_nq_examples
    ),
    preprocessors=[
        functools.partial(
            t5.data.preprocessors.parse_tsv,
            field_names=["source", "target"]),
        categorise_preprocessor,
        seqio.preprocessors.tokenize_and_append_eos,
    ],
    metric_fns=[metrics.accuracy, my_metrics.f1_macro, metrics.bleu],
    output_features=MT5_OUTPUT_FEATURES,
)

seqio.TaskRegistry.add(
    "translate_long_scand",
    source=seqio.TextLineDataSource(
        split_to_filepattern=tsv_translate_long_path,
        #num_input_examples=num_nq_examples
    ),
    preprocessors=[
        functools.partial(
            t5.data.preprocessors.parse_tsv,
            field_names=["source", "target"]),
        categorise_preprocessor,
        seqio.preprocessors.tokenize_and_append_eos,
    ],
    metric_fns=[metrics.accuracy, my_metrics.f1_macro, metrics.bleu],
    output_features=SCAND_OUTPUT_FEATURES,
)

seqio.TaskRegistry.add(
    "translate_long",
    source=seqio.TextLineDataSource(
        split_to_filepattern=tsv_translate_long_path,
        #num_input_examples=num_nq_examples
    ),
    preprocessors=[
        functools.partial(
            t5.data.preprocessors.parse_tsv,
            field_names=["source", "target"]),
        categorise_preprocessor,
        seqio.preprocessors.tokenize_and_append_eos,
    ],
    metric_fns=[metrics.accuracy, my_metrics.f1_macro, metrics.bleu],
    output_features=DEFAULT_OUTPUT_FEATURES,
)
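With these tasks registered, any of them can be inspected through the standard seqio API. A sketch along these lines (assumes read access to the GCS buckets above; the Bokmål-to-Nynorsk direction follows from the source/target fields of the nb_nn data files):

import seqio
import tasks  # importing the module registers the tasks above

task = seqio.get_mixture_or_task("translate_long")
ds = task.get_dataset(
    sequence_length={"inputs": 512, "targets": 512},
    split="validation",
    shuffle=False,
)
for ex in ds.take(1):
    print(ex["inputs"])   # token IDs of the Bokmål source
    print(ex["targets"])  # token IDs of the Nynorsk target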