pere committed
Commit 81de315 · 1 parent: dd620f6

eval updated

__pycache__/my_metrics.cpython-38.pyc ADDED
Binary file (464 Bytes).
 
__pycache__/tasks.cpython-38.pyc CHANGED
Binary files a/__pycache__/tasks.cpython-38.pyc and b/__pycache__/tasks.cpython-38.pyc differ
 
eval.py CHANGED
@@ -181,15 +181,15 @@ def evaluate(
     now = datetime.now()
     logtime = now.strftime("%d-%m-%Y %H:%M:%S")
 
-    if not os.path.exists("log"):
-        os.makedirs("log")
-
-    logname ="./log/"+"eval_results_"+socket.gethostname()+".jsonl"
+    logname = output_dir+"eval_results_"+socket.gethostname()+".jsonl"
 
     output = {}
     output["model"] = restore_checkpoint_cfg.path
+    output["task"] = dataset_cfg.mixture_or_task_name
     output["eval_date"] = logtime
     output["split"] = dataset_cfg.split
+    output["feature_length"] = dataset_cfg.task_feature_lengths
+    output["eval_batch_size"] = dataset_cfg.batch_size
     output["result"] = all_metrics.result()[dataset_cfg.mixture_or_task_name]
 
     with jsonlines.open(logname, mode="a") as writer:
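For reference, the logging block reads as follows after this change. This is a sketch reassembled from the hunk above: wrapping it in a standalone helper and the final writer.write(output) call are assumptions, since the diff cuts off at the jsonlines.open line; everything else is taken from the hunk.

# Sketch of the updated logging block, reassembled from the hunk above.
# The helper wrapper and writer.write(output) are assumptions.
import socket
from datetime import datetime

import jsonlines


def log_eval_results(output_dir, dataset_cfg, restore_checkpoint_cfg, all_metrics):
    now = datetime.now()
    logtime = now.strftime("%d-%m-%Y %H:%M:%S")

    # The log file now goes to the gin-configured output_dir (EVAL_OUTPUT_DIR)
    # instead of a hard-coded ./log/ directory created at runtime.
    logname = output_dir + "eval_results_" + socket.gethostname() + ".jsonl"

    # One JSON record per evaluation run, now also recording the task,
    # feature lengths and batch size alongside checkpoint, date and split.
    output = {}
    output["model"] = restore_checkpoint_cfg.path
    output["task"] = dataset_cfg.mixture_or_task_name
    output["eval_date"] = logtime
    output["split"] = dataset_cfg.split
    output["feature_length"] = dataset_cfg.task_feature_lengths
    output["eval_batch_size"] = dataset_cfg.batch_size
    output["result"] = all_metrics.result()[dataset_cfg.mixture_or_task_name]

    with jsonlines.open(logname, mode="a") as writer:
        writer.write(output)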
eval_base.sh CHANGED
@@ -1,5 +1,5 @@
 PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
-EVAL_OUTPUT_DIR="gs://nb-t5x/eval/"
+#EVAL_OUTPUT_DIR="gs://nb-t5x/eval/"
 T5X_DIR="../../t5x" # directory where the t5x is cloned.
 CHECKPOINT_PATH="gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000"
 export PYTHONPATH=${PROJECT_DIR}
@@ -8,4 +8,4 @@ python3 eval.py \
   --gin_search_paths=${PROJECT_DIR} \
   --gin_file="eval_categorisation_base.gin" \
   --gin.CHECKPOINT_PATH=\"${CHECKPOINT_PATH}\" \
-  --gin.EVAL_OUTPUT_DIR=\"${EVAL_OUTPUT_DIR}\" \
+  # --gin.EVAL_OUTPUT_DIR=\"${EVAL_OUTPUT_DIR}\" \
eval_categorisation_base.gin CHANGED
@@ -9,7 +9,7 @@ from t5x import utils
 include "t5x/examples/t5/mt5/base.gin"
 
 CHECKPOINT_PATH = %gin.REQUIRED # passed via commandline
-EVAL_OUTPUT_DIR = %gin.REQUIRED # passed via commandline
+EVAL_OUTPUT_DIR = "./log/"
 
 DROPOUT_RATE = 0.0 # unused boilerplate
 MIXTURE_OR_TASK_NAME = "categorise"
finetune_categorisation_base.gin CHANGED
@@ -18,8 +18,8 @@ DROPOUT_RATE = 0.1
 RANDOM_SEED = 0
 
 #Fixing a small error
-infer_eval/utils.DatasetConfig.task_feature_lengths = TASK_FEATURE_LENGTHS
-
+infer_eval/utils.DatasetConfig:
+  task_feature_lengths = %TASK_FEATURE_LENGTHS
 
 # Pere: Only necessary if we load a t5 model. We can start with an t5x model here
 # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
log/config.gin ADDED
@@ -0,0 +1,89 @@
+from __gin__ import dynamic_registration
+import __main__ as eval_script
+import seqio
+from t5.data import mixtures
+from t5x import adafactor
+from t5x.examples.t5 import network
+from t5x import models
+from t5x import partitioning
+from t5x import utils
+import tasks
+
+# Macros:
+# ==============================================================================
+CHECKPOINT_PATH = 'gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000'
+DROPOUT_RATE = 0.0
+EVAL_OUTPUT_DIR = './log/'
+LABEL_SMOOTHING = 0.0
+LOSS_NORMALIZING_FACTOR = None
+MIXTURE_OR_TASK_NAME = 'categorise'
+MODEL = @models.EncoderDecoderModel()
+OPTIMIZER = @adafactor.Adafactor()
+VOCABULARY = @seqio.SentencePieceVocabulary()
+Z_LOSS = 0.0001
+
+# Parameters for adafactor.Adafactor:
+# ==============================================================================
+adafactor.Adafactor.decay_rate = 0.8
+adafactor.Adafactor.logical_factor_rules = \
+    @adafactor.standard_logical_factor_rules()
+adafactor.Adafactor.step_offset = 0
+
+# Parameters for utils.DatasetConfig:
+# ==============================================================================
+utils.DatasetConfig.batch_size = 32
+utils.DatasetConfig.mixture_or_task_name = %MIXTURE_OR_TASK_NAME
+utils.DatasetConfig.seed = 42
+utils.DatasetConfig.shuffle = False
+utils.DatasetConfig.split = 'validation'
+utils.DatasetConfig.task_feature_lengths = {'inputs': 512, 'targets': 2}
+
+# Parameters for models.EncoderDecoderModel:
+# ==============================================================================
+models.EncoderDecoderModel.input_vocabulary = %VOCABULARY
+models.EncoderDecoderModel.label_smoothing = %LABEL_SMOOTHING
+models.EncoderDecoderModel.loss_normalizing_factor = %LOSS_NORMALIZING_FACTOR
+models.EncoderDecoderModel.module = @network.Transformer()
+models.EncoderDecoderModel.optimizer_def = %OPTIMIZER
+models.EncoderDecoderModel.output_vocabulary = %VOCABULARY
+models.EncoderDecoderModel.z_loss = %Z_LOSS
+
+# Parameters for eval_script.evaluate:
+# ==============================================================================
+eval_script.evaluate.dataset_cfg = @utils.DatasetConfig()
+eval_script.evaluate.model = %MODEL
+eval_script.evaluate.output_dir = %EVAL_OUTPUT_DIR
+eval_script.evaluate.partitioner = @partitioning.PjitPartitioner()
+eval_script.evaluate.restore_checkpoint_cfg = @utils.RestoreCheckpointConfig()
+
+# Parameters for partitioning.PjitPartitioner:
+# ==============================================================================
+partitioning.PjitPartitioner.num_partitions = 2
+
+# Parameters for utils.RestoreCheckpointConfig:
+# ==============================================================================
+utils.RestoreCheckpointConfig.mode = 'specific'
+utils.RestoreCheckpointConfig.path = %CHECKPOINT_PATH
+
+# Parameters for seqio.SentencePieceVocabulary:
+# ==============================================================================
+seqio.SentencePieceVocabulary.sentencepiece_model_file = \
+    'gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model'
+
+# Parameters for network.T5Config:
+# ==============================================================================
+network.T5Config.dropout_rate = %DROPOUT_RATE
+network.T5Config.dtype = 'bfloat16'
+network.T5Config.emb_dim = 768
+network.T5Config.head_dim = 64
+network.T5Config.logits_via_embedding = False
+network.T5Config.mlp_activations = ('gelu', 'linear')
+network.T5Config.mlp_dim = 2048
+network.T5Config.num_decoder_layers = 12
+network.T5Config.num_encoder_layers = 12
+network.T5Config.num_heads = 12
+network.T5Config.vocab_size = 250112
+
+# Parameters for network.Transformer:
+# ==============================================================================
+network.Transformer.config = @network.T5Config()
log/eval_results_t1v-n-7b23714e-w-0.jsonl CHANGED
@@ -3,3 +3,5 @@
 {"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "08-04-2022 17:40:27", "task": "categorise", "feature_length": null, "split": "validation", "result": {"accuracy": 86.33333333333333, "f1_macro": 86.33090327169275}}
 {"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "08-04-2022 18:07:14", "task": "categorise", "feature_length": null, "split": "validation", "result": {"accuracy": 86.33333333333333, "f1_macro": 86.33090327169275}}
 {"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "08-04-2022 18:31:25", "task": "categorise", "feature_length": {"inputs": 512, "targets": 2}, "split": "validation", "result": {"accuracy": 84.83333333333334, "f1_macro": 84.82911919977771}}
+{"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "11-04-2022 06:48:47", "split": "validation", "result": {"accuracy": 84.83333333333334}}
+{"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "11-04-2022 07:01:50", "split": "validation", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 32, "result": {"accuracy": 84.83333333333334, "f1_macro": 84.82911919977771}}
log/model-info.txt ADDED
@@ -0,0 +1,285 @@
+Variable decoder/decoder_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_0/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_0/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_0/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_0/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_0/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_0/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_0/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_0/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_0/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_0/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_0/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_0/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_0/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_0/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_1/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_1/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_1/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_1/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_1/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_1/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_1/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_1/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_1/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_1/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_1/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_1/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_1/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_1/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_10/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_10/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_10/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_10/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_10/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_10/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_10/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_10/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_10/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_10/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_10/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_10/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_10/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_10/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_11/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_11/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_11/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_11/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_11/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_11/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_11/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_11/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_11/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_11/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_11/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_11/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_11/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_11/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_2/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_2/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_2/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_2/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_2/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_2/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_2/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_2/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_2/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_2/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_2/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_2/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_2/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_2/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_3/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_3/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_3/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_3/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_3/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_3/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_3/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_3/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_3/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_3/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_3/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_3/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_3/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_3/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_4/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_4/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_4/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_4/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_4/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_4/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_4/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_4/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_4/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_4/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_4/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_4/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_4/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_4/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_5/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_5/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_5/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_5/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_5/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_5/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_5/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_5/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_5/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_5/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_5/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_5/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_5/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_5/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_6/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_6/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_6/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_6/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_6/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_6/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_6/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_6/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_6/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_6/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_6/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_6/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_6/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_6/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_7/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_7/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_7/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_7/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_7/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_7/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_7/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_7/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_7/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_7/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_7/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_7/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_7/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_7/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_8/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_8/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_8/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_8/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_8/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_8/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_8/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_8/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_8/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_8/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_8/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_8/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_8/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_8/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_9/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_9/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_9/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_9/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_9/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_9/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable decoder/layers_9/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable decoder/layers_9/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_9/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_9/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable decoder/layers_9/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_9/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable decoder/layers_9/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/layers_9/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable decoder/logits_dense/kernel size 192086016 shape (embed=768, vocab=250112) partition spec (None, 'model')
+Variable decoder/relpos_bias/rel_embedding size 384 shape (heads=12, relpos_buckets=32) partition spec ('model', None)
+Variable encoder/encoder_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_0/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_0/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_0/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_0/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_0/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_0/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_0/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_0/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_0/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_1/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_1/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_1/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_1/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_1/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_1/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_1/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_1/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_1/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_10/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_10/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_10/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_10/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_10/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_10/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_10/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_10/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_10/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_11/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_11/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_11/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_11/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_11/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_11/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_11/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_11/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_11/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_2/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_2/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_2/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_2/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_2/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_2/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_2/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_2/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_2/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_3/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_3/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_3/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_3/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_3/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_3/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_3/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_3/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_3/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_4/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_4/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_4/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_4/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_4/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_4/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_4/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_4/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_4/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_5/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_5/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_5/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_5/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_5/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_5/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_5/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_5/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_5/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_6/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_6/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_6/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_6/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_6/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_6/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_6/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_6/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_6/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_7/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_7/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_7/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_7/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_7/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_7/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_7/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_7/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_7/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_8/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_8/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_8/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_8/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_8/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_8/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_8/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_8/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_8/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_9/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_9/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
+Variable encoder/layers_9/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_9/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
+Variable encoder/layers_9/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_9/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
+Variable encoder/layers_9/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
+Variable encoder/layers_9/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/layers_9/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
+Variable encoder/relpos_bias/rel_embedding size 384 shape (heads=12, relpos_buckets=32) partition spec ('model', None)
+Variable token_embedder/embedding size 192086016 shape (vocab=250112, embed=768) partition spec ('model', None)
+Total number of parameters: 582401280
+
+Variable step size 1 shape () partition spec None
tasks.py CHANGED
@@ -2,6 +2,7 @@
 
 import functools
 import seqio
+import my_metrics
 import tensorflow_datasets as tfds
 from t5.evaluation import metrics
 from t5.data import preprocessors
@@ -59,7 +60,7 @@ seqio.TaskRegistry.add(
         categorise_preprocessor,
         seqio.preprocessors.tokenize_and_append_eos,
     ],
-    metric_fns=[metrics.accuracy],
+    metric_fns=[metrics.accuracy,my_metrics.f1_macro],
     output_features=DEFAULT_OUTPUT_FEATURES,
 )
 
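my_metrics.py itself is not in this diff (only its compiled .pyc appears above), so the following is only a guess at a compatible f1_macro implementation. seqio metric functions receive postprocessed (targets, predictions) and return a dict of scalar scores; the f1 values in the results log are on a 0-100 scale, hence the factor of 100.

# Hypothetical sketch of my_metrics.f1_macro; the real source is not in this commit.
import sklearn.metrics


def f1_macro(targets, predictions):
    # seqio metric_fns take (targets, predictions) and return {name: scalar}.
    # Scaled by 100 to match the 0-100 accuracy/f1 values in the results log.
    return {
        "f1_macro": 100 * sklearn.metrics.f1_score(
            targets, predictions, average="macro"
        )
    }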