pere commited on
Commit
9258b0c
1 Parent(s): 0965e09
batch_parliament_base_ref.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Fine-tune five seeds of mT5-base on the "parliament" task, then evaluate
# each run's checkpoint on the validation and test splits.
#
# Fix: the original test-split eval for v1 pointed at
# ".../v1_t5x_base_1_000_000_parliament" (missing "m"), which does not match
# the ".../v1_mt5x_base_1_000_000_parliament" directory the training step
# writes to. All paths now use the mt5x naming consistently.

PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
export PYTHONPATH=${PROJECT_DIR}

BUCKET="gs://nb-t5x-us-central2/finetuned"

# Train five independent fine-tuning runs (v1..v5).
for v in v1 v2 v3 v4 v5; do
    python3 ../../t5x/t5x/train.py \
        --gin_search_paths="./" \
        --gin_file="finetune_categorisation_base.gin" \
        --gin.MIXTURE_OR_TASK_NAME=\"parliament\" \
        --gin.MODEL_DIR=\"${BUCKET}/${v}_mt5x_base_1_000_000_parliament\"
done

# Evaluate every run's final checkpoint on both splits.
for split in validation test; do
    for v in v1 v2 v3 v4 v5; do
        python3 eval.py \
            --gin_search_paths="./" \
            --gin_file="eval_categorisation_base.gin" \
            --gin.SPLIT=\"${split}\" \
            --gin.CHECKPOINT_PATH=\"${BUCKET}/${v}_mt5x_base_1_000_000_parliament/checkpoint_1010000\"
    done
done
batch_sentiment_base.sh CHANGED
@@ -1,18 +1,18 @@
1
  PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
2
  export PYTHONPATH=${PROJECT_DIR}
3
 
4
- #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment\"
5
  python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment\"
6
- #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment\"
7
- #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment\"
8
- #python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/v5_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment\"
9
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
10
- python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
11
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
12
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
13
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v5_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
14
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
15
- python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
16
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
17
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
18
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v5_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
 
1
  PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
2
  export PYTHONPATH=${PROJECT_DIR}
3
 
4
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment\"
5
  python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment\"
6
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment\"
7
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment\"
8
+ python3 ../../t5x/t5x/train.py --gin_search_paths="./" --gin_file="finetune_categorisation_base.gin" --gin.MIXTURE_OR_TASK_NAME=\"sentiment\" --gin.MODEL_DIR=\"gs://nb-t5x-us-central2/finetuned/v5_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment\"
9
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
10
+ #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
11
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
12
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
13
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"validation\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v5_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
14
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
15
+ #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
16
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
17
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
18
  #python3 eval.py --gin_search_paths="./" --gin_file="eval_categorisation_base.gin" --gin.SPLIT=\"test\" --gin.CHECKPOINT_PATH=\"gs://nb-t5x-us-central2/finetuned/v5_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000\"
corpus/angry_tweets/test.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
corpus/angry_tweets/train.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
finetune_categorisation_base.gin CHANGED
@@ -12,8 +12,9 @@ include "t5x/configs/runs/finetune.gin"
12
 
13
  MIXTURE_OR_TASK_NAME = %gin.REQUIRED
14
  TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 2}
15
- INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000"
16
- TRAIN_STEPS = 1_510_000 # 1000000 pre-trained steps + 10000 fine-tuning steps.
 
17
  USE_CACHED_TASKS = False
18
  DROPOUT_RATE = 0.1
19
  RANDOM_SEED = 0
 
12
 
13
  MIXTURE_OR_TASK_NAME = %gin.REQUIRED
14
  TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 2}
15
+ #INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000"
16
+ INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
17
+ TRAIN_STEPS = 1_010_000 # 1000000 pre-trained steps + 10000 fine-tuning steps.
18
  USE_CACHED_TASKS = False
19
  DROPOUT_RATE = 0.1
20
  RANDOM_SEED = 0
finetune_categorisation_large.gin CHANGED
@@ -10,8 +10,8 @@ from t5x import utils
10
  include "t5x/examples/t5/mt5/large.gin"
11
  include "t5x/configs/runs/finetune.gin"
12
 
13
- MIXTURE_OR_TASK_NAME = "categorise"
14
- TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 2}
15
  TRAIN_STEPS = 1_005_000 # 1000000 pre-trained steps + 10000 fine-tuning steps.
16
  USE_CACHED_TASKS = False
17
  DROPOUT_RATE = 0.1
@@ -25,9 +25,9 @@ RANDOM_SEED = 0
25
  #LOSS_NORMALIZING_FACTOR = 234496
26
 
27
  #INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
28
- INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_large/checkpoint_1000000"
29
  #INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/pk_nb_t5x_base_run1/checkpoint_1100000"
30
- #INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_t5x_base/checkpoint_1294000"
31
 
32
 
33
 
 
10
  include "t5x/examples/t5/mt5/large.gin"
11
  include "t5x/configs/runs/finetune.gin"
12
 
13
+ MIXTURE_OR_TASK_NAME = "angry_tweets"
14
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 8}
15
  TRAIN_STEPS = 1_005_000 # 1000000 pre-trained steps + 10000 fine-tuning steps.
16
  USE_CACHED_TASKS = False
17
  DROPOUT_RATE = 0.1
 
25
  #LOSS_NORMALIZING_FACTOR = 234496
26
 
27
  #INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
28
+ #INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_large/checkpoint_1000000"
29
  #INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/pk_nb_t5x_base_run1/checkpoint_1100000"
30
+ INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_NCC_plus_English_pluss200k_scandinavian_t5x_large/checkpoint_1700000"
31
 
32
 
33
 
finetune_large.sh CHANGED
@@ -1,7 +1,7 @@
1
  PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
2
  T5X_DIR="../../t5x" # directory where the t5x is cloned.
3
  #Needs to be updated when moving to tpu-v4 it should then be in another zone
4
- MODEL_DIR="gs://nb-t5x/eval_large"
5
  export PYTHONPATH=${PROJECT_DIR}
6
 
7
  python3 ${T5X_DIR}/t5x/train.py \
 
1
  PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
2
  T5X_DIR="../../t5x" # directory where the t5x is cloned.
3
  #Needs to be updated when moving to tpu-v4 it should then be in another zone
4
+ MODEL_DIR="gs://nb-t5x-us-central2/finetuned/v1_eval_angry_tweets_scandinavian_large"
5
  export PYTHONPATH=${PROJECT_DIR}
6
 
7
  python3 ${T5X_DIR}/t5x/train.py \
log/eval_results_t1v-n-7b23714e-w-0.jsonl CHANGED
@@ -16,4 +16,6 @@
16
  {"model": "gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:01:28", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 87.29016786570743, "f1_macro": 84.21729163839953}}
17
  {"model": "gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:03:06", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 84.89208633093526, "f1_macro": 81.23942213621073}}
18
  {"model": "gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:04:20", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 85.61151079136691, "f1_macro": 82.17948717948718}}
19
- {"model": "gs://nb-t5x-us-central2/finetuned/v5_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:05:39", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 80.0959232613909, "f1_macro": 74.5979905029614}}
 
 
 
16
  {"model": "gs://nb-t5x-us-central2/finetuned/v1_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:01:28", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 87.29016786570743, "f1_macro": 84.21729163839953}}
17
  {"model": "gs://nb-t5x-us-central2/finetuned/v3_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:03:06", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 84.89208633093526, "f1_macro": 81.23942213621073}}
18
  {"model": "gs://nb-t5x-us-central2/finetuned/v4_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 14:04:20", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 85.61151079136691, "f1_macro": 82.17948717948718}}
19
+ {"model": "gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "13-04-2022 20:48:48", "split": "validation", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 85.27131782945736, "f1_macro": 82.54273504273503}}
20
+ {"model": "gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "14-04-2022 04:53:37", "split": "validation", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 85.27131782945736, "f1_macro": 82.54273504273503}}
21
+ {"model": "gs://nb-t5x-us-central2/finetuned/v2_norwegian_NCC_plus_English_t5x_base_1_500_000_sentiment/checkpoint_1510000", "task": "sentiment", "eval_date": "14-04-2022 04:54:48", "split": "test", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 16, "result": {"accuracy": 84.65227817745803, "f1_macro": 80.68438422789647}}
my_preprocessors.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import functools
3
+ import math
4
+ import re
5
+ from typing import Callable, Mapping, Optional, Sequence, Union
6
+ import uuid
7
+
8
+ from absl import logging
9
+ import babel
10
+ import gin
11
+ import seqio
12
+ import tensorflow.compat.v2 as tf
13
+
14
+ import json
15
+ import pandas as pd
16
+
17
+ # We disable no-value-for-parameter since the seqio.map_over_dataset leads to
18
+ # a false positive when seeds are provided.
19
+ # pylint:disable=no-value-for-parameter
20
+ AUTOTUNE = tf.data.experimental.AUTOTUNE
21
+
22
+ FeatureType = Mapping[str, tf.Tensor]
23
+
24
+ rekey = seqio.preprocessors.rekey
25
+ tokenize = seqio.preprocessors.tokenize
26
+
27
+
28
@seqio.map_over_dataset
def parse_tsv(line, field_names=None, field_delim='\t'):
    """Splits TSV lines into dict examples mapping field name to string value.

    Args:
      line: an example containing a comma/tab-delimited string.
      field_names: a list of strings, the ordered names of the TSV fields.
        Defaults to "inputs" and "targets".
      field_delim: a string, the delimiter to split on e.g. ',' for csv.

    Returns:
      A feature dict mapping field name to string value.
    """
    columns = field_names or ['inputs', 'targets']
    # One empty-string default per column; decode_csv returns one tensor
    # per column in order.
    values = tf.io.decode_csv(
        line,
        record_defaults=[''] * len(columns),
        field_delim=field_delim,
        use_quote_delim=False)
    return dict(zip(columns, values))
47
+
48
+
49
@seqio.map_over_dataset
def parse_json(line, field_delim='\t'):
    """Splits JSON lines into dict examples mapping field name to string value.

    The record is round-tripped through pandas: parsed as a single JSON-lines
    row, re-serialized as one tab-separated line, and split back into per-field
    string tensors with tf.io.decode_csv.

    Args:
      line: an example containing valid json.
      field_delim: a string, the delimiter passed to tf.io.decode_csv.

    Returns:
      A feature dict mapping field name to string value.
    """
    # NOTE(review): pd.read_json / to_csv are eager Python calls, but
    # seqio.map_over_dataset traces this function in TF graph mode where
    # `line` is a symbolic tf.Tensor — confirm this actually executes
    # (e.g. under eager mode or via tf.py_function) before relying on it.
    mydf = pd.read_json(line, lines=True)
    # NOTE(review): to_csv hard-codes sep="\t" while decode_csv below uses
    # `field_delim` — these disagree if a non-tab delimiter is passed.
    line = mydf.to_csv(header=False, index=False, sep="\t").strip()
    field_names = list(mydf.columns)

    return dict(
        zip(field_names,
            tf.io.decode_csv(
                line,
                record_defaults=[''] * len(field_names),
                field_delim=field_delim,
                use_quote_delim=False)))
preprocessors.py ADDED
The diff for this file is too large to render. See raw diff
 
tasks.py CHANGED
@@ -5,7 +5,8 @@ import seqio
5
  import my_metrics
6
  import tensorflow_datasets as tfds
7
  from t5.evaluation import metrics
8
- from t5.data import preprocessors
 
9
  import t5
10
  import tensorflow.compat.v1 as tf
11
 
@@ -21,6 +22,13 @@ tsv_sentiment_path = {
21
  "test": "gs://notram-public/finetune_datasets/norec_sentiment/test.tsv"
22
  }
23
 
 
 
 
 
 
 
 
24
  vocabulary = seqio.SentencePieceVocabulary(
25
  'gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)
26
 
@@ -52,6 +60,25 @@ def categorise_preprocessor(ds):
52
  return ds.map(to_inputs_and_targets,
53
  num_parallel_calls=tf.data.experimental.AUTOTUNE)
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  seqio.TaskRegistry.add(
57
  "parliament",
@@ -87,3 +114,19 @@ seqio.TaskRegistry.add(
87
  output_features=DEFAULT_OUTPUT_FEATURES,
88
  )
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import my_metrics
6
  import tensorflow_datasets as tfds
7
  from t5.evaluation import metrics
8
+ #from t5.data import preprocessors
9
+ import my_preprocessors
10
  import t5
11
  import tensorflow.compat.v1 as tf
12
 
 
22
  "test": "gs://notram-public/finetune_datasets/norec_sentiment/test.tsv"
23
  }
24
 
25
# Split name -> GCS file pattern for the AngryTweets JSON-lines dataset.
json_angry_tweets_path = {
    "train": "gs://notram-public/finetune_datasets/angry_tweets/train.jsonl",
    # NOTE(review): "validation" and "test" both point at test.jsonl —
    # confirm this is intentional (i.e. the dataset ships no validation split).
    "validation": "gs://notram-public/finetune_datasets/angry_tweets/test.jsonl",
    "test": "gs://notram-public/finetune_datasets/angry_tweets/test.jsonl"
}
30
+
31
+
32
  vocabulary = seqio.SentencePieceVocabulary(
33
  'gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)
34
 
 
60
  return ds.map(to_inputs_and_targets,
61
  num_parallel_calls=tf.data.experimental.AUTOTUNE)
62
 
63
def scandeval_preprocessor(ds):
    """Converts ScandEval-style examples to seqio text-to-text format.

    Maps {"text": ..., "label": ...} -> {"inputs": ..., "targets": ...},
    stripping single quotes that wrap either field.

    Args:
      ds: a tf.data.Dataset of feature dicts with "text" and "label" keys.

    Returns:
      A tf.data.Dataset of {"inputs": ..., "targets": ...} string features.
    """
    def normalize_text(text):
        """Removes a wrapping pair of single quotes from a TF string."""
        # '(.*)' -> captured content; a bare unquoted string passes through.
        text = tf.strings.regex_replace(text, "'(.*)'", r"\1")
        return text

    def to_inputs_and_targets(ex):
        """Builds the inputs/targets dict for one example."""
        # tf.strings.join over a single element is effectively identity;
        # kept as-is to mirror the other preprocessors in this file.
        return {
            "inputs":
                tf.strings.join(
                    [normalize_text(ex["text"])]),
            "targets":
                tf.strings.join(
                    [normalize_text(ex["label"])]),
        }
    return ds.map(to_inputs_and_targets,
                  num_parallel_calls=tf.data.experimental.AUTOTUNE)
81
+
82
 
83
  seqio.TaskRegistry.add(
84
  "parliament",
 
114
  output_features=DEFAULT_OUTPUT_FEATURES,
115
  )
116
 
117
# Register the AngryTweets sentiment-classification task.
seqio.TaskRegistry.add(
    "angry_tweets",
    source=seqio.TextLineDataSource(
        split_to_filepattern=json_angry_tweets_path,
        #num_input_examples=num_nq_examples
    ),
    preprocessors=[
        # The source files are JSON Lines (.jsonl), so parse each line as
        # JSON; the original used parse_tsv, which would try to tab-split
        # raw JSON text and never yield the "text"/"label" fields that
        # scandeval_preprocessor expects. (The no-op functools.partial
        # wrapper is dropped as well.)
        my_preprocessors.parse_json,
        scandeval_preprocessor,
        seqio.preprocessors.tokenize_and_append_eos,
    ],
    metric_fns=[metrics.accuracy, my_metrics.f1_macro],
    output_features=DEFAULT_OUTPUT_FEATURES,
)
132
+