---
# Experiment-wide settings for the dyu -> fr transformer run.
name: "dyu_fr_transformer-sp"
joeynmt_version: "2.3.0"
# Same directory that the vocab, SentencePiece model and checkpoint paths
# elsewhere in this file point into.
model_dir: "../saved_model/lean_model"
use_cuda: false  # false for CPU training
fp16: false
# Corpus location, preprocessing and vocabulary settings.
# NOTE(review): nesting was reconstructed from a flattened copy of this file;
# `sample_dev_subset` and `special_symbols` are placed directly under `data`
# following JoeyNMT's config layout - confirm against the original.
data:
  # All three splits point at the same dataset directory.
  train: "../data/dyu_fr"
  dev: "../data/dyu_fr"
  test: "../data/dyu_fr"
  dataset_type: "huggingface"
  dataset_cfg:
    name: "dyu-fr"
  sample_dev_subset: 1460

  # Source side. Uses the same vocab file and SentencePiece model as `trg`.
  src:
    lang: "dyu"
    max_length: 100
    lowercase: false
    normalize: false
    level: "bpe"
    voc_limit: 4000
    voc_min_freq: 1
    voc_file: "../saved_model/lean_model/vocab.txt"
    tokenizer_type: "sentencepiece"
    tokenizer_cfg:
      model_file: "../saved_model/lean_model/sp.model"

  # Target side. Settings mirror `src` apart from the language code.
  trg:
    lang: "fr"
    max_length: 100
    lowercase: false
    normalize: false
    level: "bpe"
    voc_limit: 4000
    voc_min_freq: 1
    voc_file: "../saved_model/lean_model/vocab.txt"
    tokenizer_type: "sentencepiece"
    tokenizer_cfg:
      model_file: "../saved_model/lean_model/sp.model"

  # Special tokens and their ids.
  # Presumably these must agree with the ids in vocab.txt / sp.model - verify.
  special_symbols:
    unk_token: "<unk>"
    unk_id: 0
    pad_token: "<pad>"
    pad_id: 1
    bos_token: "<s>"
    bos_id: 2
    eos_token: "</s>"
    eos_id: 3
# Decoding and evaluation settings.
testing:
  load_model: "../saved_model/lean_model/best.ckpt"
  n_best: 1
  beam_size: 5
  beam_alpha: 1.0
  # With batch_type "token", batch_size is presumably a token count rather
  # than a sentence count - confirm against the JoeyNMT docs.
  batch_size: 256
  batch_type: "token"
  # Same value as max_length in the data section.
  max_output_length: 100
  eval_metrics: ["bleu"]
  # Disabled options, kept for reference:
  # return_prob: "hyp"
  # return_attention: false
  sacrebleu_cfg:
    tokenize: "13a"
# Optimisation, schedule and checkpointing settings.
training:
  # Disabled options (presumably for resuming from latest.ckpt - confirm):
  # load_model: "../saved_model/lean_model/latest.ckpt"
  # reset_best_ckpt: false
  # reset_scheduler: false
  # reset_optimizer: false
  # reset_iter_state: false
  random_seed: 42

  # Optimiser and learning-rate schedule.
  optimizer: "adamw"
  normalization: "tokens"
  adam_betas: [0.9, 0.999]
  scheduling: "warmupinversesquareroot"
  learning_rate_warmup: 100
  learning_rate: 0.0003
  learning_rate_min: 0.00000001
  weight_decay: 0.0

  # Loss.
  label_smoothing: 0.1
  loss: "crossentropy"

  # Batching. batch_multiplier is presumably gradient accumulation, which
  # would give 512 x 4 = 2048 tokens per update - confirm.
  batch_size: 512
  batch_type: "token"
  batch_multiplier: 4

  # Run length and validation cadence.
  # NOTE(review): both `epochs` and `updates` are set; presumably training
  # stops at whichever limit is hit first - confirm.
  early_stopping_metric: "bleu"
  epochs: 6
  updates: 550
  validation_freq: 30
  logging_freq: 5

  # Housekeeping.
  overwrite: true
  shuffle: true
  print_valid_sents: [0, 1, 2, 3]
  keep_best_ckpts: 3
# Network architecture: 6-layer transformer encoder and decoder, width 256.
model:
  initializer: "xavier_uniform"
  bias_initializer: "zeros"
  init_gain: 1.0
  embed_initializer: "xavier_uniform"
  embed_init_gain: 1.0
  # Both tying options are on; src and trg in the data section use the same
  # vocab file.
  tied_embeddings: true
  tied_softmax: true

  # NOTE(review): encoder and decoder differ in num_heads (4 vs 8) and in
  # layer dropout (0.2 vs 0.1) - confirm the asymmetry is intentional.
  encoder:
    type: "transformer"
    num_layers: 6
    num_heads: 4
    embeddings:
      embedding_dim: 256
      scale: true
      dropout: 0.0
    # typically ff_size = 4 x hidden_size
    hidden_size: 256
    ff_size: 1024
    dropout: 0.2
    layer_norm: "pre"

  decoder:
    type: "transformer"
    num_layers: 6
    num_heads: 8
    embeddings:
      embedding_dim: 256
      scale: true
      dropout: 0.0
    # typically ff_size = 4 x hidden_size
    hidden_size: 256
    ff_size: 1024
    dropout: 0.1
    layer_norm: "pre"