test
Browse files- batch_parliament_base_ref.sh +19 -0
- batch_sentiment_base.sh +6 -6
- corpus/angry_tweets/test.jsonl +0 -0
- corpus/angry_tweets/train.jsonl +0 -0
- finetune_categorisation_base.gin +3 -2
- finetune_categorisation_large.gin +4 -4
- finetune_large.sh +1 -1
- log/eval_results_t1v-n-7b23714e-w-0.jsonl +3 -1
- my_preprocessors.py +67 -0
- preprocessors.py +0 -0
- tasks.py +44 -1
batch_parliament_base_ref.sh
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Fine-tunes five independent "parliament" runs (v1..v5), then evaluates the
# final checkpoint of every run on the validation split and on the test split.
# Commands run sequentially, in the same order as before: all training first,
# then all validation evals, then all test evals.
PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
export PYTHONPATH=${PROJECT_DIR}

# Bucket prefix shared by every run; run N is stored under
# ${MODEL_ROOT}/vN_${RUN_SUFFIX}. Spelling the name once keeps the training
# and evaluation paths in sync (the v1 test eval previously pointed at
# "v1_t5x_base_..." instead of the "v1_mt5x_base_..." directory trained here).
MODEL_ROOT="gs://nb-t5x-us-central2/finetuned"
RUN_SUFFIX="mt5x_base_1_000_000_parliament"
# Checkpoint evaluated for each run.
CHECKPOINT="checkpoint_1010000"
RUNS="1 2 3 4 5"

for run in ${RUNS}; do
  python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"parliament\" --gin.MODEL_DIR="\"${MODEL_ROOT}/v${run}_${RUN_SUFFIX}\""
done

for split in validation test; do
  for run in ${RUNS}; do
    python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT="\"${split}\"" --gin.CHECKPOINT_PATH="\"${MODEL_ROOT}/v${run}_${RUN_SUFFIX}/${CHECKPOINT}\""
  done
done
|
19 |
+
|
batch_sentiment_base.sh
CHANGED
@@ -1,18 +1,18 @@
|
|
1 |
PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
|
2 |
export PYTHONPATH=${PROJECT_DIR}
|
3 |
|
4 |
-
|
5 |
python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment\"
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
#python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
|
10 |
-
python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
|
11 |
#python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
|
12 |
#python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
|
13 |
#python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v5_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
|
14 |
#python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
|
15 |
-
python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
|
16 |
#python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
|
17 |
#python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
|
18 |
#python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v5_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
|
|
|
1 |
# Fine-tunes five independent "sentiment" runs (v1..v5), one after another.
PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
export PYTHONPATH=${PROJECT_DIR}

MODEL_ROOT="gs://nb-t5x-us-central2/finetuned"
RUN_SUFFIX="norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment"

for run in 1 2 3 4 5; do
  python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR="\"${MODEL_ROOT}/v${run}_${RUN_SUFFIX}\""
done

# Evaluation of checkpoint_1510000 is currently disabled for every run and
# split; uncomment the loop below to evaluate validation first, then test.
#for split in validation test; do
#  for run in 1 2 3 4 5; do
#    python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT="\"${split}\"" --gin.CHECKPOINT_PATH="\"${MODEL_ROOT}/v${run}_${RUN_SUFFIX}/checkpoint_1510000\""
#  done
#done
|
corpus/angry_tweets/test.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
corpus/angry_tweets/train.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
finetune_categorisation_base.gin
CHANGED
@@ -12,8 +12,9 @@ include "t5x/configs/runs/finetune.gin"
|
|
12 |
|
13 |
MIXTURE_OR_TASK_NAME = %gin.REQUIRED
|
14 |
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 2}
|
15 |
-
INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000"
|
16 |
-
|
|
|
17 |
USE_CACHED_TASKS = False
|
18 |
DROPOUT_RATE = 0.1
|
19 |
RANDOM_SEED = 0
|
|
|
12 |
|
13 |
MIXTURE_OR_TASK_NAME = %gin.REQUIRED
|
14 |
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 2}
|
15 |
+
#INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000"
|
16 |
+
INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
|
17 |
+
TRAIN_STEPS = 1_010_000 # 1000000 pre-trained steps + 10000 fine-tuning steps.
|
18 |
USE_CACHED_TASKS = False
|
19 |
DROPOUT_RATE = 0.1
|
20 |
RANDOM_SEED = 0
|
finetune_categorisation_large.gin
CHANGED
@@ -10,8 +10,8 @@ from t5x import utils
|
|
10 |
include "t5x/examples/t5/mt5/large.gin"
|
11 |
include "t5x/configs/runs/finetune.gin"
|
12 |
|
13 |
-
MIXTURE_OR_TASK_NAME = "
|
14 |
-
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets":
|
15 |
TRAIN_STEPS = 1_005_000 # 1000000 pre-trained steps + 5000 fine-tuning steps.
|
16 |
USE_CACHED_TASKS = False
|
17 |
DROPOUT_RATE = 0.1
|
@@ -25,9 +25,9 @@ RANDOM_SEED = 0
|
|
25 |
#LOSS_NORMALIZING_FACTOR = 234496
|
26 |
|
27 |
#INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
|
28 |
-
INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_large/checkpoint_1000000"
|
29 |
#INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/pk_nb_t5x_base_run1/checkpoint_1100000"
|
30 |
-
|
31 |
|
32 |
|
33 |
|
|
|
10 |
include "t5x/examples/t5/mt5/large.gin"
|
11 |
include "t5x/configs/runs/finetune.gin"
|
12 |
|
13 |
+
MIXTURE_OR_TASK_NAME = "angry_tweets"
|
14 |
+
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 8}
|
15 |
TRAIN_STEPS = 1_005_000 # 1000000 pre-trained steps + 5000 fine-tuning steps.
|
16 |
USE_CACHED_TASKS = False
|
17 |
DROPOUT_RATE = 0.1
|
|
|
25 |
#LOSS_NORMALIZING_FACTOR = 234496
|
26 |
|
27 |
#INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
|
28 |
+
#INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_large/checkpoint_1000000"
|
29 |
#INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/pk_nb_t5x_base_run1/checkpoint_1100000"
|
30 |
+
INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_NCC_plus_English_pluss200k_scandinavian_t5x_large/checkpoint_1700000"
|
31 |
|
32 |
|
33 |
|
finetune_large.sh
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
|
2 |
T5X_DIR="../../t5x" # directory where the t5x is cloned.
|
3 |
#Needs to be updated when moving to tpu-v4 it should then be in another zone
|
4 |
-
MODEL_DIR="gs://nb-t5x/
|
5 |
export PYTHONPATH=${PROJECT_DIR}
|
6 |
|
7 |
python3 ${T5X_DIR}/t5x/train.py \
|
|
|
1 |
PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
|
2 |
T5X_DIR="../../t5x" # directory where the t5x is cloned.
|
3 |
#Needs to be updated when moving to tpu-v4 it should then be in another zone
|
4 |
+
MODEL_DIR="gs://nb-t5x-us-central2/finetuned/v1_eval_angry_tweets_scandinavian_large"
|
5 |
export PYTHONPATH=${PROJECT_DIR}
|
6 |
|
7 |
python3 ${T5X_DIR}/t5x/train.py \
|
log/eval_results_t1v-n-7b23714e-w-0.jsonl
CHANGED
@@ -16,4 +16,6 @@
|
|
16 |
{"model": "gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:01:28", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 87.29016786570743, "f1_macro": 84.21729163839953}}
|
17 |
{"model": "gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:03:06", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 84.89208633093526, "f1_macro": 81.23942213621073}}
|
18 |
{"model": "gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:04:20", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 85.61151079136691, "f1_macro": 82.17948717948718}}
|
19 |
-
{"model": "gs://nb-t5x-us-central2/finetuned/
|
|
|
|
|
|
16 |
{"model": "gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:01:28", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 87.29016786570743, "f1_macro": 84.21729163839953}}
|
17 |
{"model": "gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:03:06", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 84.89208633093526, "f1_macro": 81.23942213621073}}
|
18 |
{"model": "gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:04:20", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 85.61151079136691, "f1_macro": 82.17948717948718}}
|
19 |
+
{"model": "gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 20:48:48", "split": "validation", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 85.27131782945736, "f1_macro": 82.54273504273503}}
|
20 |
+
{"model": "gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "14-04-2022 04:53:37", "split": "validation", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 85.27131782945736, "f1_macro": 82.54273504273503}}
|
21 |
+
{"model": "gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "14-04-2022 04:54:48", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 84.65227817745803, "f1_macro": 80.68438422789647}}
|
my_preprocessors.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import collections
|
2 |
+
import functools
|
3 |
+
import math
|
4 |
+
import re
|
5 |
+
from typing import Callable, Mapping, Optional, Sequence, Union
|
6 |
+
import uuid
|
7 |
+
|
8 |
+
from absl import logging
|
9 |
+
import babel
|
10 |
+
import gin
|
11 |
+
import seqio
|
12 |
+
import tensorflow.compat.v2 as tf
|
13 |
+
|
14 |
+
import json
|
15 |
+
import pandas as pd
|
16 |
+
|
17 |
+
# We disable no-value-for-parameter since the seqio.map_over_dataset leads to
|
18 |
+
# a false positive when seeds are provided.
|
19 |
+
# pylint:disable=no-value-for-parameter
|
20 |
+
AUTOTUNE = tf.data.experimental.AUTOTUNE
|
21 |
+
|
22 |
+
FeatureType = Mapping[str, tf.Tensor]
|
23 |
+
|
24 |
+
rekey = seqio.preprocessors.rekey
|
25 |
+
tokenize = seqio.preprocessors.tokenize
|
26 |
+
|
27 |
+
|
28 |
+
@seqio.map_over_dataset
def parse_tsv(line, field_names=None, field_delim='\t'):
    """Turns one delimited text line into a dict of named string fields.

    Args:
      line: a string example holding delimiter-separated values.
      field_names: ordered names for the columns of `line`. When empty or
        None, the two names "inputs" and "targets" are used.
      field_delim: the column separator, e.g. ',' for CSV. Defaults to tab.
    Returns:
      A dict mapping each field name to the string value of its column.
    """
    if not field_names:
        field_names = ['inputs', 'targets']

    # Every column defaults to the empty string; quote characters are treated
    # as ordinary text rather than as field delimiters.
    columns = tf.io.decode_csv(
        line,
        record_defaults=[''] * len(field_names),
        field_delim=field_delim,
        use_quote_delim=False)
    return dict(zip(field_names, columns))
|
47 |
+
|
48 |
+
|
49 |
+
@seqio.map_over_dataset
def parse_json(line, field_delim='\t'):
    """Turns one JSON line into a dict keyed by the JSON record's field names.

    The record is loaded into a one-row pandas frame, written back out as a
    single delimited line and split again with `tf.io.decode_csv`, so every
    value comes back as a string.

    Args:
      line: an example containing valid JSON (one record).
      field_delim: the separator used for the intermediate delimited line.
        It is used both when writing and when splitting, so the two steps
        always agree. Defaults to tab.
    Returns:
      A feature dict mapping field name to string value.
    """
    # NOTE(review): this function is mapped over a dataset, so `line` may be a
    # symbolic tensor rather than a Python string; pandas may not accept that.
    # Confirm it runs inside Dataset.map, or wrap the pandas step in
    # tf.py_function.
    mydf = pd.read_json(line, lines=True)
    field_names = list(mydf.columns)

    # Write with the same delimiter that decode_csv splits on below. The
    # separator used to be hard-coded to a tab, which silently mis-split the
    # line whenever a caller passed a different `field_delim`.
    # NOTE(review): to_csv quotes values that contain the delimiter, while
    # decode_csv runs with use_quote_delim=False; values with embedded
    # delimiters would still split wrongly. Verify against the data.
    line = mydf.to_csv(header=False, index=False, sep=field_delim).strip()

    return dict(
        zip(field_names,
            tf.io.decode_csv(
                line,
                record_defaults=[''] * len(field_names),
                field_delim=field_delim,
                use_quote_delim=False)))
|
preprocessors.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tasks.py
CHANGED
@@ -5,7 +5,8 @@ import seqio
|
|
5 |
import my_metrics
|
6 |
import tensorflow_datasets as tfds
|
7 |
from t5.evaluation import metrics
|
8 |
-
from t5.data import preprocessors
|
|
|
9 |
import t5
|
10 |
import tensorflow.compat.v1 as tf
|
11 |
|
@@ -21,6 +22,13 @@ tsv_sentiment_path = {
|
|
21 |
"test": "gs://notram-public/finetune_datasets/norec_sentiment/test.tsv"
|
22 |
}
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
vocabulary = seqio.SentencePieceVocabulary(
|
25 |
'gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)
|
26 |
|
@@ -52,6 +60,25 @@ def categorise_preprocessor(ds):
|
|
52 |
return ds.map(to_inputs_and_targets,
|
53 |
num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
seqio.TaskRegistry.add(
|
57 |
"parliament",
|
@@ -87,3 +114,19 @@ seqio.TaskRegistry.add(
|
|
87 |
output_features=DEFAULT_OUTPUT_FEATURES,
|
88 |
)
|
89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
import my_metrics
|
6 |
import tensorflow_datasets as tfds
|
7 |
from t5.evaluation import metrics
|
8 |
+
#from t5.data import preprocessors
|
9 |
+
import my_preprocessors
|
10 |
import t5
|
11 |
import tensorflow.compat.v1 as tf
|
12 |
|
|
|
22 |
"test": "gs://notram-public/finetune_datasets/norec_sentiment/test.tsv"
|
23 |
}
|
24 |
|
25 |
+
# Split name -> JSON-lines file for the angry_tweets fine-tuning data.
# NOTE(review): "validation" and "test" point at the same file, so validation
# scores are computed on the test data. Confirm this is intended.
json_angry_tweets_path = dict(
    train="gs://notram-public/finetune_datasets/angry_tweets/train.jsonl",
    validation="gs://notram-public/finetune_datasets/angry_tweets/test.jsonl",
    test="gs://notram-public/finetune_datasets/angry_tweets/test.jsonl",
)
|
30 |
+
|
31 |
+
|
32 |
vocabulary = seqio.SentencePieceVocabulary(
|
33 |
'gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)
|
34 |
|
|
|
60 |
return ds.map(to_inputs_and_targets,
|
61 |
num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
62 |
|
63 |
+
def scandeval_preprocessor(ds):
    """Converts {"text", "label"} examples into {"inputs", "targets"} examples.

    Args:
      ds: a dataset whose examples are dicts with "text" and "label" entries
        (string tensors, since they are passed to a string regex op).
    Returns:
      The dataset mapped to dicts with "inputs" (from "text") and "targets"
      (from "label"), each with enclosing single quotes removed.
    """

    def normalize_text(text):
        """Removes a pair of single quotes, keeping the text between them.

        The match is greedy, so it spans from the first single quote to the
        last one it can pair with. No case folding is applied.
        """
        return tf.strings.regex_replace(text, "'(.*)'", r"\1")

    def to_inputs_and_targets(ex):
        """Map {"text": ..., "label": ...} -> {"inputs": ..., "targets": ...}."""
        # The single-element tf.strings.join wrappers were no-ops and have
        # been dropped; the resulting values are unchanged.
        return {
            "inputs": normalize_text(ex["text"]),
            "targets": normalize_text(ex["label"]),
        }

    return ds.map(to_inputs_and_targets,
                  num_parallel_calls=tf.data.experimental.AUTOTUNE)
|
81 |
+
|
82 |
|
83 |
seqio.TaskRegistry.add(
|
84 |
"parliament",
|
|
|
114 |
output_features=DEFAULT_OUTPUT_FEATURES,
|
115 |
)
|
116 |
|
117 |
+
# Registers the "angry_tweets" task, read line by line from the JSON-lines
# files listed in json_angry_tweets_path and scored with accuracy and macro F1.
seqio.TaskRegistry.add(
    "angry_tweets",
    source=seqio.TextLineDataSource(
        split_to_filepattern=json_angry_tweets_path,
    ),
    preprocessors=[
        # The source files are JSON lines, so parse them with parse_json,
        # which keys each example by the JSON field names. parse_tsv with its
        # default field names yields "inputs"/"targets", but
        # scandeval_preprocessor reads "text" and "label".
        # NOTE(review): assumes every record has "text" and "label" fields and
        # that parse_json can run inside the dataset map. Confirm both.
        my_preprocessors.parse_json,
        scandeval_preprocessor,
        seqio.preprocessors.tokenize_and_append_eos,
    ],
    metric_fns=[metrics.accuracy, my_metrics.f1_macro],
    output_features=DEFAULT_OUTPUT_FEATURES,
)
|
132 |
+
|