# LongT5 Base model. Config based on T5.1.1 Base model.
# Provides MODEL
from __gin__ import dynamic_registration

import seqio
from t5x import adafactor
from t5x import models

import tasks

ARCHITECTURE = %gin.REQUIRED

include 'flaxformer/t5x/configs/longt5/architectures/longt5_1_1_flaxformer.gin'
include 't5x/configs/runs/pretrain.gin'
#include 'pretrain_cont.gin'

MIXTURE_OR_TASK_NAME = "ncc_scandinavian_span_corruption_stream"
TASK_FEATURE_LENGTHS = {"inputs": 4048, "targets": 910}
BATCH_SIZE = 32  # CORRECT IS 128!!
TRAIN_STEPS = 1_000_000
DROPOUT_RATE = 0.0  # Changed from the default since T5.1.1 recommends this.
#INITIAL_CHECKPOINT_PATH = "gs://nb-t5x-us-central2/norwegian_NCC_plus_English_t5x_base/checkpoint_1500000"

#PjitPartitioner.num_partitions = 1

# Architecture overrides
NUM_HEADS = 12
NUM_ENCODER_LAYERS = 12
NUM_DECODER_LAYERS = 12
HEAD_DIM = 64
EMBED_DIM = 768
MLP_DIM = 2048

# Loss HParam defaults
Z_LOSS = 0.0001
LABEL_SMOOTHING = 0.0
LOSS_NORMALIZING_FACTOR = None

# Vocabulary (shared by encoder and decoder)
VOCABULARY = @seqio.SentencePieceVocabulary()
seqio.SentencePieceVocabulary.sentencepiece_model_file = "gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model"
NUM_EMBEDDINGS = 32128  # vocab size rounded to a multiple of 128 for TPU efficiency

# Optimizer
# `learning_rate` is set by `Trainer.learning_rate_fn`.
OPTIMIZER = @adafactor.Adafactor()
adafactor.Adafactor:
  decay_rate = 0.8
  step_offset = 0

# Model
MODEL = @models.EncoderDecoderModel()
models.EncoderDecoderModel:
  module = %ARCHITECTURE  # provided by longt5_flaxformer
  input_vocabulary = %VOCABULARY
  output_vocabulary = %VOCABULARY
  optimizer_def = %OPTIMIZER
  z_loss = %Z_LOSS
  label_smoothing = %LABEL_SMOOTHING
  loss_normalizing_factor = %LOSS_NORMALIZING_FACTOR
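
# Example launch (sketch only; not part of the original config). The file name,
# MODEL_DIR bucket, and search path below are placeholder assumptions; adjust
# them to your environment before running:
#
#   python -m t5x.train \
#     --gin_search_paths="." \
#     --gin_file="longt5_base.gin" \
#     --gin.MODEL_DIR="'gs://your-bucket/longt5_base'" \
#     --logtostderr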