eval updated
Browse files- __pycache__/my_metrics.cpython-38.pyc +0 -0
- __pycache__/tasks.cpython-38.pyc +0 -0
- eval.py +4 -4
- eval_base.sh +2 -2
- eval_categorisation_base.gin +1 -1
- finetune_categorisation_base.gin +2 -2
- log/config.gin +89 -0
- log/eval_results_t1v-n-7b23714e-w-0.jsonl +2 -0
- log/model-info.txt +285 -0
- tasks.py +2 -1
__pycache__/my_metrics.cpython-38.pyc
ADDED
Binary file (464 Bytes). View file
|
|
__pycache__/tasks.cpython-38.pyc
CHANGED
Binary files a/__pycache__/tasks.cpython-38.pyc and b/__pycache__/tasks.cpython-38.pyc differ
|
|
eval.py
CHANGED
@@ -181,15 +181,15 @@ def evaluate(
|
|
181 |
now = datetime.now()
|
182 |
logtime = now.strftime("%d-%m-%Y %H:%M:%S")
|
183 |
|
184 |
-
|
185 |
-
os.makedirs("log")
|
186 |
-
|
187 |
-
logname ="./log/"+"eval_results_"+socket.gethostname()+".jsonl"
|
188 |
|
189 |
output = {}
|
190 |
output["model"] = restore_checkpoint_cfg.path
|
|
|
191 |
output["eval_date"] = logtime
|
192 |
output["split"] = dataset_cfg.split
|
|
|
|
|
193 |
output["result"] = all_metrics.result()[dataset_cfg.mixture_or_task_name]
|
194 |
|
195 |
with jsonlines.open(logname, mode="a") as writer:
|
|
|
181 |
now = datetime.now()
|
182 |
logtime = now.strftime("%d-%m-%Y %H:%M:%S")
|
183 |
|
184 |
+
logname = output_dir+"eval_results_"+socket.gethostname()+".jsonl"
|
|
|
|
|
|
|
185 |
|
186 |
output = {}
|
187 |
output["model"] = restore_checkpoint_cfg.path
|
188 |
+
output["task"] = dataset_cfg.mixture_or_task_name
|
189 |
output["eval_date"] = logtime
|
190 |
output["split"] = dataset_cfg.split
|
191 |
+
output["feature_length"] = dataset_cfg.task_feature_lengths
|
192 |
+
output["eval_batch_size"] = dataset_cfg.batch_size
|
193 |
output["result"] = all_metrics.result()[dataset_cfg.mixture_or_task_name]
|
194 |
|
195 |
with jsonlines.open(logname, mode="a") as writer:
|
eval_base.sh
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
|
2 |
-
EVAL_OUTPUT_DIR="gs://nb-t5x/eval/"
|
3 |
T5X_DIR="../../t5x" # directory where the t5x is cloned.
|
4 |
CHECKPOINT_PATH="gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000"
|
5 |
export PYTHONPATH=${PROJECT_DIR}
|
@@ -8,4 +8,4 @@ python3 eval.py \
|
|
8 |
--gin_search_paths=${PROJECT_DIR} \
|
9 |
--gin_file="eval_categorisation_base.gin" \
|
10 |
--gin.CHECKPOINT_PATH=\"${CHECKPOINT_PATH}\" \
|
11 |
-
--gin.EVAL_OUTPUT_DIR=\"${EVAL_OUTPUT_DIR}\" \
|
|
|
1 |
PROJECT_DIR=${HOME}"/models/t5-parliament-categorisation"
|
2 |
+
#EVAL_OUTPUT_DIR="gs://nb-t5x/eval/"
|
3 |
T5X_DIR="../../t5x" # directory where the t5x is cloned.
|
4 |
CHECKPOINT_PATH="gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000"
|
5 |
export PYTHONPATH=${PROJECT_DIR}
|
|
|
8 |
--gin_search_paths=${PROJECT_DIR} \
|
9 |
--gin_file="eval_categorisation_base.gin" \
|
10 |
--gin.CHECKPOINT_PATH=\"${CHECKPOINT_PATH}\" \
|
11 |
+
# --gin.EVAL_OUTPUT_DIR=\"${EVAL_OUTPUT_DIR}\" \
|
eval_categorisation_base.gin
CHANGED
@@ -9,7 +9,7 @@ from t5x import utils
|
|
9 |
include "t5x/examples/t5/mt5/base.gin"
|
10 |
|
11 |
CHECKPOINT_PATH = %gin.REQUIRED # passed via commandline
|
12 |
-
EVAL_OUTPUT_DIR =
|
13 |
|
14 |
DROPOUT_RATE = 0.0 # unused boilerplate
|
15 |
MIXTURE_OR_TASK_NAME = "categorise"
|
|
|
9 |
include "t5x/examples/t5/mt5/base.gin"
|
10 |
|
11 |
CHECKPOINT_PATH = %gin.REQUIRED # passed via commandline
|
12 |
+
EVAL_OUTPUT_DIR = "./log/"
|
13 |
|
14 |
DROPOUT_RATE = 0.0 # unused boilerplate
|
15 |
MIXTURE_OR_TASK_NAME = "categorise"
|
finetune_categorisation_base.gin
CHANGED
@@ -18,8 +18,8 @@ DROPOUT_RATE = 0.1
|
|
18 |
RANDOM_SEED = 0
|
19 |
|
20 |
#Fixing a small error
|
21 |
-
infer_eval/utils.DatasetConfig
|
22 |
-
|
23 |
|
24 |
# Pere: Only necessary if we load a t5 model. We can start with an t5x model here
|
25 |
# `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
|
|
|
18 |
RANDOM_SEED = 0
|
19 |
|
20 |
#Fixing a small error
|
21 |
+
infer_eval/utils.DatasetConfig:
|
22 |
+
task_feature_lengths = %TASK_FEATURE_LENGTHS
|
23 |
|
24 |
# Pere: Only necessary if we load a t5 model. We can start with an t5x model here
|
25 |
# `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
|
log/config.gin
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __gin__ import dynamic_registration
|
2 |
+
import __main__ as eval_script
|
3 |
+
import seqio
|
4 |
+
from t5.data import mixtures
|
5 |
+
from t5x import adafactor
|
6 |
+
from t5x.examples.t5 import network
|
7 |
+
from t5x import models
|
8 |
+
from t5x import partitioning
|
9 |
+
from t5x import utils
|
10 |
+
import tasks
|
11 |
+
|
12 |
+
# Macros:
|
13 |
+
# ==============================================================================
|
14 |
+
CHECKPOINT_PATH = 'gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000'
|
15 |
+
DROPOUT_RATE = 0.0
|
16 |
+
EVAL_OUTPUT_DIR = './log/'
|
17 |
+
LABEL_SMOOTHING = 0.0
|
18 |
+
LOSS_NORMALIZING_FACTOR = None
|
19 |
+
MIXTURE_OR_TASK_NAME = 'categorise'
|
20 |
+
MODEL = @models.EncoderDecoderModel()
|
21 |
+
OPTIMIZER = @adafactor.Adafactor()
|
22 |
+
VOCABULARY = @seqio.SentencePieceVocabulary()
|
23 |
+
Z_LOSS = 0.0001
|
24 |
+
|
25 |
+
# Parameters for adafactor.Adafactor:
|
26 |
+
# ==============================================================================
|
27 |
+
adafactor.Adafactor.decay_rate = 0.8
|
28 |
+
adafactor.Adafactor.logical_factor_rules = \
|
29 |
+
@adafactor.standard_logical_factor_rules()
|
30 |
+
adafactor.Adafactor.step_offset = 0
|
31 |
+
|
32 |
+
# Parameters for utils.DatasetConfig:
|
33 |
+
# ==============================================================================
|
34 |
+
utils.DatasetConfig.batch_size = 32
|
35 |
+
utils.DatasetConfig.mixture_or_task_name = %MIXTURE_OR_TASK_NAME
|
36 |
+
utils.DatasetConfig.seed = 42
|
37 |
+
utils.DatasetConfig.shuffle = False
|
38 |
+
utils.DatasetConfig.split = 'validation'
|
39 |
+
utils.DatasetConfig.task_feature_lengths = {'inputs': 512, 'targets': 2}
|
40 |
+
|
41 |
+
# Parameters for models.EncoderDecoderModel:
|
42 |
+
# ==============================================================================
|
43 |
+
models.EncoderDecoderModel.input_vocabulary = %VOCABULARY
|
44 |
+
models.EncoderDecoderModel.label_smoothing = %LABEL_SMOOTHING
|
45 |
+
models.EncoderDecoderModel.loss_normalizing_factor = %LOSS_NORMALIZING_FACTOR
|
46 |
+
models.EncoderDecoderModel.module = @network.Transformer()
|
47 |
+
models.EncoderDecoderModel.optimizer_def = %OPTIMIZER
|
48 |
+
models.EncoderDecoderModel.output_vocabulary = %VOCABULARY
|
49 |
+
models.EncoderDecoderModel.z_loss = %Z_LOSS
|
50 |
+
|
51 |
+
# Parameters for eval_script.evaluate:
|
52 |
+
# ==============================================================================
|
53 |
+
eval_script.evaluate.dataset_cfg = @utils.DatasetConfig()
|
54 |
+
eval_script.evaluate.model = %MODEL
|
55 |
+
eval_script.evaluate.output_dir = %EVAL_OUTPUT_DIR
|
56 |
+
eval_script.evaluate.partitioner = @partitioning.PjitPartitioner()
|
57 |
+
eval_script.evaluate.restore_checkpoint_cfg = @utils.RestoreCheckpointConfig()
|
58 |
+
|
59 |
+
# Parameters for partitioning.PjitPartitioner:
|
60 |
+
# ==============================================================================
|
61 |
+
partitioning.PjitPartitioner.num_partitions = 2
|
62 |
+
|
63 |
+
# Parameters for utils.RestoreCheckpointConfig:
|
64 |
+
# ==============================================================================
|
65 |
+
utils.RestoreCheckpointConfig.mode = 'specific'
|
66 |
+
utils.RestoreCheckpointConfig.path = %CHECKPOINT_PATH
|
67 |
+
|
68 |
+
# Parameters for seqio.SentencePieceVocabulary:
|
69 |
+
# ==============================================================================
|
70 |
+
seqio.SentencePieceVocabulary.sentencepiece_model_file = \
|
71 |
+
'gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model'
|
72 |
+
|
73 |
+
# Parameters for network.T5Config:
|
74 |
+
# ==============================================================================
|
75 |
+
network.T5Config.dropout_rate = %DROPOUT_RATE
|
76 |
+
network.T5Config.dtype = 'bfloat16'
|
77 |
+
network.T5Config.emb_dim = 768
|
78 |
+
network.T5Config.head_dim = 64
|
79 |
+
network.T5Config.logits_via_embedding = False
|
80 |
+
network.T5Config.mlp_activations = ('gelu', 'linear')
|
81 |
+
network.T5Config.mlp_dim = 2048
|
82 |
+
network.T5Config.num_decoder_layers = 12
|
83 |
+
network.T5Config.num_encoder_layers = 12
|
84 |
+
network.T5Config.num_heads = 12
|
85 |
+
network.T5Config.vocab_size = 250112
|
86 |
+
|
87 |
+
# Parameters for network.Transformer:
|
88 |
+
# ==============================================================================
|
89 |
+
network.Transformer.config = @network.T5Config()
|
log/eval_results_t1v-n-7b23714e-w-0.jsonl
CHANGED
@@ -3,3 +3,5 @@
|
|
3 |
{"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "08-04-2022 17:40:27", "task": "categorise", "feature_length": null, "split": "validation", "result": {"accuracy": 86.33333333333333, "f1_macro": 86.33090327169275}}
|
4 |
{"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "08-04-2022 18:07:14", "task": "categorise", "feature_length": null, "split": "validation", "result": {"accuracy": 86.33333333333333, "f1_macro": 86.33090327169275}}
|
5 |
{"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "08-04-2022 18:31:25", "task": "categorise", "feature_length": {"inputs": 512, "targets": 2}, "split": "validation", "result": {"accuracy": 84.83333333333334, "f1_macro": 84.82911919977771}}
|
|
|
|
|
|
3 |
{"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "08-04-2022 17:40:27", "task": "categorise", "feature_length": null, "split": "validation", "result": {"accuracy": 86.33333333333333, "f1_macro": 86.33090327169275}}
|
4 |
{"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "08-04-2022 18:07:14", "task": "categorise", "feature_length": null, "split": "validation", "result": {"accuracy": 86.33333333333333, "f1_macro": 86.33090327169275}}
|
5 |
{"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "08-04-2022 18:31:25", "task": "categorise", "feature_length": {"inputs": 512, "targets": 2}, "split": "validation", "result": {"accuracy": 84.83333333333334, "f1_macro": 84.82911919977771}}
|
6 |
+
{"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "11-04-2022 06:48:47", "split": "validation", "result": {"accuracy": 84.83333333333334}}
|
7 |
+
{"model": "gs://nb-t5x/eval_norwegian_NCC_2_000_000/checkpoint_2005000", "eval_date": "11-04-2022 07:01:50", "split": "validation", "feature_length": {"inputs": 512, "targets": 2}, "eval_batch_size": 32, "result": {"accuracy": 84.83333333333334, "f1_macro": 84.82911919977771}}
|
log/model-info.txt
ADDED
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Variable decoder/decoder_norm/scale size 768 shape (embed=768) partition spec (None,)
|
2 |
+
Variable decoder/layers_0/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
3 |
+
Variable decoder/layers_0/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
4 |
+
Variable decoder/layers_0/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
5 |
+
Variable decoder/layers_0/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
6 |
+
Variable decoder/layers_0/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
7 |
+
Variable decoder/layers_0/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
8 |
+
Variable decoder/layers_0/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
9 |
+
Variable decoder/layers_0/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
10 |
+
Variable decoder/layers_0/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
11 |
+
Variable decoder/layers_0/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
12 |
+
Variable decoder/layers_0/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
13 |
+
Variable decoder/layers_0/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
14 |
+
Variable decoder/layers_0/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
15 |
+
Variable decoder/layers_0/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
16 |
+
Variable decoder/layers_1/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
17 |
+
Variable decoder/layers_1/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
18 |
+
Variable decoder/layers_1/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
19 |
+
Variable decoder/layers_1/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
20 |
+
Variable decoder/layers_1/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
21 |
+
Variable decoder/layers_1/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
22 |
+
Variable decoder/layers_1/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
23 |
+
Variable decoder/layers_1/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
24 |
+
Variable decoder/layers_1/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
25 |
+
Variable decoder/layers_1/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
26 |
+
Variable decoder/layers_1/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
27 |
+
Variable decoder/layers_1/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
28 |
+
Variable decoder/layers_1/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
29 |
+
Variable decoder/layers_1/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
30 |
+
Variable decoder/layers_10/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
31 |
+
Variable decoder/layers_10/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
32 |
+
Variable decoder/layers_10/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
33 |
+
Variable decoder/layers_10/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
34 |
+
Variable decoder/layers_10/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
35 |
+
Variable decoder/layers_10/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
36 |
+
Variable decoder/layers_10/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
37 |
+
Variable decoder/layers_10/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
38 |
+
Variable decoder/layers_10/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
39 |
+
Variable decoder/layers_10/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
40 |
+
Variable decoder/layers_10/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
41 |
+
Variable decoder/layers_10/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
42 |
+
Variable decoder/layers_10/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
43 |
+
Variable decoder/layers_10/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
44 |
+
Variable decoder/layers_11/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
45 |
+
Variable decoder/layers_11/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
46 |
+
Variable decoder/layers_11/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
47 |
+
Variable decoder/layers_11/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
48 |
+
Variable decoder/layers_11/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
49 |
+
Variable decoder/layers_11/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
50 |
+
Variable decoder/layers_11/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
51 |
+
Variable decoder/layers_11/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
52 |
+
Variable decoder/layers_11/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
53 |
+
Variable decoder/layers_11/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
54 |
+
Variable decoder/layers_11/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
55 |
+
Variable decoder/layers_11/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
56 |
+
Variable decoder/layers_11/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
57 |
+
Variable decoder/layers_11/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
58 |
+
Variable decoder/layers_2/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
59 |
+
Variable decoder/layers_2/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
60 |
+
Variable decoder/layers_2/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
61 |
+
Variable decoder/layers_2/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
62 |
+
Variable decoder/layers_2/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
63 |
+
Variable decoder/layers_2/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
64 |
+
Variable decoder/layers_2/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
65 |
+
Variable decoder/layers_2/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
66 |
+
Variable decoder/layers_2/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
67 |
+
Variable decoder/layers_2/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
68 |
+
Variable decoder/layers_2/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
69 |
+
Variable decoder/layers_2/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
70 |
+
Variable decoder/layers_2/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
71 |
+
Variable decoder/layers_2/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
72 |
+
Variable decoder/layers_3/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
73 |
+
Variable decoder/layers_3/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
74 |
+
Variable decoder/layers_3/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
75 |
+
Variable decoder/layers_3/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
76 |
+
Variable decoder/layers_3/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
77 |
+
Variable decoder/layers_3/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
78 |
+
Variable decoder/layers_3/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
79 |
+
Variable decoder/layers_3/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
80 |
+
Variable decoder/layers_3/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
81 |
+
Variable decoder/layers_3/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
82 |
+
Variable decoder/layers_3/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
83 |
+
Variable decoder/layers_3/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
84 |
+
Variable decoder/layers_3/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
85 |
+
Variable decoder/layers_3/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
86 |
+
Variable decoder/layers_4/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
87 |
+
Variable decoder/layers_4/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
88 |
+
Variable decoder/layers_4/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
89 |
+
Variable decoder/layers_4/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
90 |
+
Variable decoder/layers_4/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
91 |
+
Variable decoder/layers_4/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
92 |
+
Variable decoder/layers_4/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
93 |
+
Variable decoder/layers_4/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
94 |
+
Variable decoder/layers_4/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
95 |
+
Variable decoder/layers_4/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
96 |
+
Variable decoder/layers_4/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
97 |
+
Variable decoder/layers_4/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
98 |
+
Variable decoder/layers_4/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
99 |
+
Variable decoder/layers_4/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
100 |
+
Variable decoder/layers_5/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
101 |
+
Variable decoder/layers_5/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
102 |
+
Variable decoder/layers_5/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
103 |
+
Variable decoder/layers_5/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
104 |
+
Variable decoder/layers_5/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
105 |
+
Variable decoder/layers_5/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
106 |
+
Variable decoder/layers_5/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
107 |
+
Variable decoder/layers_5/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
108 |
+
Variable decoder/layers_5/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
109 |
+
Variable decoder/layers_5/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
110 |
+
Variable decoder/layers_5/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
111 |
+
Variable decoder/layers_5/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
112 |
+
Variable decoder/layers_5/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
113 |
+
Variable decoder/layers_5/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
114 |
+
Variable decoder/layers_6/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
115 |
+
Variable decoder/layers_6/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
116 |
+
Variable decoder/layers_6/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
117 |
+
Variable decoder/layers_6/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
118 |
+
Variable decoder/layers_6/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
119 |
+
Variable decoder/layers_6/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
120 |
+
Variable decoder/layers_6/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
121 |
+
Variable decoder/layers_6/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
122 |
+
Variable decoder/layers_6/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
123 |
+
Variable decoder/layers_6/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
124 |
+
Variable decoder/layers_6/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
125 |
+
Variable decoder/layers_6/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
126 |
+
Variable decoder/layers_6/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
127 |
+
Variable decoder/layers_6/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
128 |
+
Variable decoder/layers_7/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
129 |
+
Variable decoder/layers_7/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
130 |
+
Variable decoder/layers_7/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
131 |
+
Variable decoder/layers_7/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
132 |
+
Variable decoder/layers_7/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
133 |
+
Variable decoder/layers_7/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
134 |
+
Variable decoder/layers_7/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
135 |
+
Variable decoder/layers_7/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
136 |
+
Variable decoder/layers_7/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
137 |
+
Variable decoder/layers_7/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
138 |
+
Variable decoder/layers_7/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
139 |
+
Variable decoder/layers_7/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
140 |
+
Variable decoder/layers_7/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
141 |
+
Variable decoder/layers_7/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
142 |
+
Variable decoder/layers_8/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
143 |
+
Variable decoder/layers_8/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
144 |
+
Variable decoder/layers_8/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
145 |
+
Variable decoder/layers_8/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
146 |
+
Variable decoder/layers_8/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
147 |
+
Variable decoder/layers_8/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
148 |
+
Variable decoder/layers_8/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
149 |
+
Variable decoder/layers_8/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
150 |
+
Variable decoder/layers_8/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
151 |
+
Variable decoder/layers_8/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
152 |
+
Variable decoder/layers_8/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
153 |
+
Variable decoder/layers_8/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
154 |
+
Variable decoder/layers_8/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
155 |
+
Variable decoder/layers_8/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
156 |
+
Variable decoder/layers_9/encoder_decoder_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
157 |
+
Variable decoder/layers_9/encoder_decoder_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
158 |
+
Variable decoder/layers_9/encoder_decoder_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
159 |
+
Variable decoder/layers_9/encoder_decoder_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
160 |
+
Variable decoder/layers_9/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
161 |
+
Variable decoder/layers_9/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
162 |
+
Variable decoder/layers_9/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
163 |
+
Variable decoder/layers_9/pre_cross_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
164 |
+
Variable decoder/layers_9/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
165 |
+
Variable decoder/layers_9/pre_self_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
166 |
+
Variable decoder/layers_9/self_attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
167 |
+
Variable decoder/layers_9/self_attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
168 |
+
Variable decoder/layers_9/self_attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
169 |
+
Variable decoder/layers_9/self_attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
170 |
+
Variable decoder/logits_dense/kernel size 192086016 shape (embed=768, vocab=250112) partition spec (None, 'model')
|
171 |
+
Variable decoder/relpos_bias/rel_embedding size 384 shape (heads=12, relpos_buckets=32) partition spec ('model', None)
|
172 |
+
Variable encoder/encoder_norm/scale size 768 shape (embed=768) partition spec (None,)
|
173 |
+
Variable encoder/layers_0/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
174 |
+
Variable encoder/layers_0/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
175 |
+
Variable encoder/layers_0/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
176 |
+
Variable encoder/layers_0/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
177 |
+
Variable encoder/layers_0/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
178 |
+
Variable encoder/layers_0/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
179 |
+
Variable encoder/layers_0/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
180 |
+
Variable encoder/layers_0/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
181 |
+
Variable encoder/layers_0/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
182 |
+
Variable encoder/layers_1/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
183 |
+
Variable encoder/layers_1/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
184 |
+
Variable encoder/layers_1/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
185 |
+
Variable encoder/layers_1/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
186 |
+
Variable encoder/layers_1/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
187 |
+
Variable encoder/layers_1/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
188 |
+
Variable encoder/layers_1/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
189 |
+
Variable encoder/layers_1/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
190 |
+
Variable encoder/layers_1/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
191 |
+
Variable encoder/layers_10/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
192 |
+
Variable encoder/layers_10/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
193 |
+
Variable encoder/layers_10/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
194 |
+
Variable encoder/layers_10/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
195 |
+
Variable encoder/layers_10/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
196 |
+
Variable encoder/layers_10/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
197 |
+
Variable encoder/layers_10/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
198 |
+
Variable encoder/layers_10/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
199 |
+
Variable encoder/layers_10/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
200 |
+
Variable encoder/layers_11/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
201 |
+
Variable encoder/layers_11/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
202 |
+
Variable encoder/layers_11/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
203 |
+
Variable encoder/layers_11/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
204 |
+
Variable encoder/layers_11/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
205 |
+
Variable encoder/layers_11/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
206 |
+
Variable encoder/layers_11/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
207 |
+
Variable encoder/layers_11/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
208 |
+
Variable encoder/layers_11/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
209 |
+
Variable encoder/layers_2/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
210 |
+
Variable encoder/layers_2/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
211 |
+
Variable encoder/layers_2/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
212 |
+
Variable encoder/layers_2/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
213 |
+
Variable encoder/layers_2/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
214 |
+
Variable encoder/layers_2/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
215 |
+
Variable encoder/layers_2/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
216 |
+
Variable encoder/layers_2/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
217 |
+
Variable encoder/layers_2/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
218 |
+
Variable encoder/layers_3/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
219 |
+
Variable encoder/layers_3/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
220 |
+
Variable encoder/layers_3/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
221 |
+
Variable encoder/layers_3/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
222 |
+
Variable encoder/layers_3/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
223 |
+
Variable encoder/layers_3/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
224 |
+
Variable encoder/layers_3/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
225 |
+
Variable encoder/layers_3/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
226 |
+
Variable encoder/layers_3/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
227 |
+
Variable encoder/layers_4/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
228 |
+
Variable encoder/layers_4/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
229 |
+
Variable encoder/layers_4/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
230 |
+
Variable encoder/layers_4/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
231 |
+
Variable encoder/layers_4/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
232 |
+
Variable encoder/layers_4/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
233 |
+
Variable encoder/layers_4/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
234 |
+
Variable encoder/layers_4/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
235 |
+
Variable encoder/layers_4/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
236 |
+
Variable encoder/layers_5/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
237 |
+
Variable encoder/layers_5/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
238 |
+
Variable encoder/layers_5/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
239 |
+
Variable encoder/layers_5/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
240 |
+
Variable encoder/layers_5/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
241 |
+
Variable encoder/layers_5/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
242 |
+
Variable encoder/layers_5/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
243 |
+
Variable encoder/layers_5/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
244 |
+
Variable encoder/layers_5/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
245 |
+
Variable encoder/layers_6/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
246 |
+
Variable encoder/layers_6/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
247 |
+
Variable encoder/layers_6/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
248 |
+
Variable encoder/layers_6/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
249 |
+
Variable encoder/layers_6/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
250 |
+
Variable encoder/layers_6/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
251 |
+
Variable encoder/layers_6/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
252 |
+
Variable encoder/layers_6/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
253 |
+
Variable encoder/layers_6/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
254 |
+
Variable encoder/layers_7/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
255 |
+
Variable encoder/layers_7/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
256 |
+
Variable encoder/layers_7/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
257 |
+
Variable encoder/layers_7/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
258 |
+
Variable encoder/layers_7/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
259 |
+
Variable encoder/layers_7/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
260 |
+
Variable encoder/layers_7/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
261 |
+
Variable encoder/layers_7/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
262 |
+
Variable encoder/layers_7/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
263 |
+
Variable encoder/layers_8/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
264 |
+
Variable encoder/layers_8/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
265 |
+
Variable encoder/layers_8/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
266 |
+
Variable encoder/layers_8/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
267 |
+
Variable encoder/layers_8/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
268 |
+
Variable encoder/layers_8/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
269 |
+
Variable encoder/layers_8/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
270 |
+
Variable encoder/layers_8/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
271 |
+
Variable encoder/layers_8/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
272 |
+
Variable encoder/layers_9/attention/key/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
273 |
+
Variable encoder/layers_9/attention/out/kernel size 589824 shape (joined_kv=768, embed=768) partition spec ('model', None)
|
274 |
+
Variable encoder/layers_9/attention/query/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
275 |
+
Variable encoder/layers_9/attention/value/kernel size 589824 shape (embed=768, joined_kv=768) partition spec (None, 'model')
|
276 |
+
Variable encoder/layers_9/mlp/wi_0/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
277 |
+
Variable encoder/layers_9/mlp/wi_1/kernel size 1572864 shape (embed=768, mlp=2048) partition spec (None, 'model')
|
278 |
+
Variable encoder/layers_9/mlp/wo/kernel size 1572864 shape (mlp=2048, embed=768) partition spec ('model', None)
|
279 |
+
Variable encoder/layers_9/pre_attention_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
280 |
+
Variable encoder/layers_9/pre_mlp_layer_norm/scale size 768 shape (embed=768) partition spec (None,)
|
281 |
+
Variable encoder/relpos_bias/rel_embedding size 384 shape (heads=12, relpos_buckets=32) partition spec ('model', None)
|
282 |
+
Variable token_embedder/embedding size 192086016 shape (vocab=250112, embed=768) partition spec ('model', None)
|
283 |
+
Total number of parameters: 582401280
|
284 |
+
|
285 |
+
Variable step size 1 shape () partition spec None
|
tasks.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2 |
|
3 |
import functools
|
4 |
import seqio
|
|
|
5 |
import tensorflow_datasets as tfds
|
6 |
from t5.evaluation import metrics
|
7 |
from t5.data import preprocessors
|
@@ -59,7 +60,7 @@ seqio.TaskRegistry.add(
|
|
59 |
categorise_preprocessor,
|
60 |
seqio.preprocessors.tokenize_and_append_eos,
|
61 |
],
|
62 |
-
metric_fns=[metrics.accuracy],
|
63 |
output_features=DEFAULT_OUTPUT_FEATURES,
|
64 |
)
|
65 |
|
|
|
2 |
|
3 |
import functools
|
4 |
import seqio
|
5 |
+
import my_metrics
|
6 |
import tensorflow_datasets as tfds
|
7 |
from t5.evaluation import metrics
|
8 |
from t5.data import preprocessors
|
|
|
60 |
categorise_preprocessor,
|
61 |
seqio.preprocessors.tokenize_and_append_eos,
|
62 |
],
|
63 |
+
metric_fns=[metrics.accuracy,my_metrics.f1_macro],
|
64 |
output_features=DEFAULT_OUTPUT_FEATURES,
|
65 |
)
|
66 |
|