name: "enaf_transformer" data: src: "en" trg: "af" train: "data/enaf/train.bpe" dev: "data/enaf/dev.bpe" test: "data/enaf/test.bpe" level: "bpe" lowercase: False max_sent_length: 100 src_vocab: "data/enaf/vocab.txt" trg_vocab: "data/enaf/vocab.txt" testing: beam_size: 5 alpha: 1.0 training: #load_model: "/content/drive/My Drive/masakhane/en-af-baseline/models/enaf_transformer/1.ckpt" # if uncommented, load a pre-trained model from this checkpoint #load_model: "/content/drive/My Drive/masakhane/en-af-baseline/joeynmt/models/enaf_transformer/3500.ckpt" # if uncommented, load a pre-trained model from this checkpoint random_seed: 42 optimizer: "adam" normalization: "tokens" adam_betas: [0.9, 0.999] scheduling: "noam" # Try switching to Elan scheduling learning_rate_decay_length: 5000 # number of steps to reduce by the decay factor for Elan method learning_rate_peak: 0.002 # peak for Elan scheduler (default: 1) learning_rate_warmup: 2000 # warmup steps for Elan scheduler learning_rate_factor: 1 # factor for Noam scheduler (used with Transformer) learning_rate_warmup: 1000 # warmup steps for Noam scheduler (used with Transformer) patience: 8 decrease_factor: 0.7 loss: "crossentropy" learning_rate: 0.0002 learning_rate_min: 0.00000001 weight_decay: 0.0 label_smoothing: 0.1 batch_size: 4096 batch_type: "token" eval_batch_size: 3600 eval_batch_type: "token" batch_multiplier: 1 early_stopping_metric: "ppl" epochs: 30 # TODO: Decrease for when playing around and checking of working. Around 30 is sufficient to check if its working at all validation_freq: 500 # 4000 # Decrease this for testing logging_freq: 100 eval_metric: "bleu" model_dir: "models/enaf_transformer" overwrite: True shuffle: True use_cuda: True max_output_length: 100 print_valid_sents: [0, 1, 2, 3] keep_last_ckpts: 3 model: initializer: "xavier" bias_initializer: "zeros" init_gain: 1.0 embed_initializer: "xavier" embed_init_gain: 1.0 tied_embeddings: True tied_softmax: True encoder: type: "transformer" num_layers: 3 num_heads: 8 embeddings: embedding_dim: 512 scale: True dropout: 0. # typically ff_size = 4 x hidden_size hidden_size: 512 ff_size: 2048 dropout: 0.3 decoder: type: "transformer" num_layers: 3 num_heads: 8 embeddings: embedding_dim: 512 scale: True dropout: 0. # typically ff_size = 4 x hidden_size hidden_size: 512 ff_size: 2048 dropout: 0.25