Nos_MT-OpenNMT-gl-es / bpe-gl-es_emb.yaml
imdbo's picture
Create bpe-gl-es_emb.yaml
7bae329
raw
history blame
No virus
4.63 kB
# Base path for saved run artefacts (see the OpenNMT-py `save_data` option).
save_data: run
## Where the vocab(s) will be written
src_vocab: run/vocab/es-gl/bpe.vocab.src
tgt_vocab: run/vocab/es-gl/bpe.vocab.tgt
# Canonical lowercase boolean; `True` only works through YAML 1.1 implicit typing.
overwrite: true
# Corpus opts:
# Every entry nested under `data` is one named corpus with its target (es) file,
# its source (gl) file, the transforms applied to it and, for training corpora,
# a `weight`. The nesting matters: without it `data` is empty and the per-corpus
# keys collide as top-level duplicates.
data:
  europarl:
    path_tgt: ../DGTcorpora_tokenized/es_gz/europarl/partitions/es_train.txt
    path_src: ../DGTcorpora_tokenized/es_gz/europarl_translit/partitions/gl_train.txt
    transforms: [bpe, filtertoolong]
    weight: 120  # other values noted by the author: 60, 120
  opensub:
    path_tgt: ../DGTcorpora_tokenized/es_gz/opensub/partitions/es_train.txt
    path_src: ../DGTcorpora_tokenized/es_gz/opensub_translit/partitions/gl_train.txt
    transforms: [bpe, filtertoolong]
    weight: 180  # other values noted by the author: 900, 180
  dgt:
    path_tgt: ../DGTcorpora_tokenized/es_gz/dgt/partitions/es_train.txt
    path_src: ../DGTcorpora_tokenized/es_gz/dgt_translit/partitions/gl_train.txt
    transforms: [bpe, filtertoolong]
    weight: 18  # other values noted by the author: 9, 18
  cluvi:
    path_tgt: ../DGTcorpora_tokenized/es_gz/cluvi/partitions/es_train.txt
    path_src: ../DGTcorpora_tokenized/es_gz/cluvi/partitions/gl_train.txt
    transforms: [bpe, filtertoolong]
    weight: 40  # other values noted by the author: 4, 40
  opensub-es-gl:
    path_tgt: ../DGTcorpora_tokenized/es_gz/opensub-es-gl/partitions/es_train.txt
    path_src: ../DGTcorpora_tokenized/es_gz/opensub-es-gl/partitions/gl_train.txt
    transforms: [bpe, filtertoolong]
    weight: 25  # other values noted by the author: 5, 25, 25
  ted2020:
    path_tgt: ../DGTcorpora_tokenized/es_gz/ted2020/partitions/es_train.txt
    path_src: ../DGTcorpora_tokenized/es_gz/ted2020/partitions/gl_train.txt
    transforms: [bpe, filtertoolong]
    weight: 10  # other values noted by the author: 1, 10, 10
  corgaback:
    path_tgt: ../DGTcorpora_tokenized/es_gz/corgaback/partitions/es_train.txt
    path_src: ../DGTcorpora_tokenized/es_gz/corgaback/partitions/gl_train.txt
    transforms: [bpe, filtertoolong]
    weight: 13  # other values noted by the author: 66, 14, 13
  ccmatrix:
    path_tgt: ../DGTcorpora_tokenized/es_gz/ccmatrix/es.txt
    path_src: ../DGTcorpora_tokenized/es_gz/ccmatrix/gl.txt
    transforms: [bpe, filtertoolong]
    weight: 180  # same as opensub, similar size
  resto:
    path_tgt: ../DGTcorpora_tokenized/es_gz/resto/es.txt
    path_src: ../DGTcorpora_tokenized/es_gz/resto/gl.txt
    transforms: [bpe, filtertoolong]
    weight: 120  # same as europarl, similar size
  opensub_2018:
    path_tgt: ../DGTcorpora_tokenized/es_gz/opensub_2018/es.txt
    path_src: ../DGTcorpora_tokenized/es_gz/opensub_2018/gl.txt
    transforms: [bpe, filtertoolong]
    weight: 25  # same as opensub-es-gl
  # Validation set; it carries no `weight`.
  valid:
    path_tgt: ../DGTcorpora_tokenized/es_gz/partitions/all-es_valid.txt
    path_src: ../DGTcorpora_tokenized/es_gz/partitions_translit/all-gl_valid.txt
    transforms: [bpe, filtertoolong]
### Transform related opts:
#### Subword
# BPE code files, one per side (source = gl, target = es).
src_subword_model: ./bpe/gl.code
tgt_subword_model: ./bpe/es.code
# Each side points at the vocab file of the same side, matching the
# `src_vocab` / `tgt_vocab` paths declared earlier in this file.
# NOTE(review): these two paths used to be crossed (src -> .tgt, tgt -> .src);
# confirm the files on disk follow the src/tgt naming.
src_subword_vocab: ./run/vocab/es-gl/bpe.vocab.src
tgt_subword_vocab: ./run/vocab/es-gl/bpe.vocab.tgt
# Disabled alternative: SentencePiece models.
#tgt_subword_model: ../sentencepiece/en-gl/en.sp.model
#src_subword_model: ../sentencepiece/en-gl/gl.sp.model
src_subword_type: bpe
# Spelled `tgt_subword_type` to mirror `src_subword_type`; the misspelled
# `tgt_subord_type` is not the option name used for the source side.
tgt_subword_type: bpe
src_subword_nbest: 1
src_subword_alpha: 0.0
tgt_subword_nbest: 1
tgt_subword_alpha: 0.0
## Embeddings
# Pretrained embedding files, one per side (source = gl, target = es).
src_embeddings: ../embeddings/gl.emb.txt
tgt_embeddings: ../embeddings/es.emb.txt
## supported types: GloVe, word2vec
embeddings_type: "word2vec"
# word_vec_size needs to match the dimension of the pretrained embeddings.
# NOTE(review): this file also sets `word_vec_size: 512` in the "# Model"
# section. Duplicate keys are invalid YAML, and loaders that accept them keep
# the last occurrence, so the value below was already being overridden. It is
# left disabled here so the key is defined only once. TODO: confirm the
# dimension of the vectors in the two files above and adjust the remaining
# `word_vec_size` if they differ.
# word_vec_size: 300
#### Filter
# Per-side sequence length limits (presumably what the `filtertoolong`
# transform enforces -- confirm against the OpenNMT-py docs).
tgt_seq_length: 150
src_seq_length: 150
# Empty lines in the data are ignored without any message.
skip_empty_level: 'silent'
# General opts
seed: 1234
# Run length, validation interval and reporting interval.
train_steps: 200000
valid_steps: 10000
report_every: 1000
# Checkpoint path prefix, save interval and number of checkpoints kept.
save_model: 'run/model'
save_checkpoint_steps: 10000
keep_checkpoint: 50
average_decay: 0.0005
# Batching
# Single process on GPU rank 0.
world_size: 1
gpu_ranks:
  - 0
queue_size: 10000
bucket_size: 32768
# Batch sizes are counted in tokens for training.
batch_type: tokens
batch_size: 8192
# Disabled alternative:
#batch_size: 4096
batch_size_multiple: 1
valid_batch_size: 64
max_generator_batches: 2
# Accumulation count 4, listed against step 0.
accum_count:
  - 4
accum_steps:
  - 0
# Optimization
model_dtype: fp16
# Adam with the "noam" decay schedule and 8000 warmup steps.
optim: adam
adam_beta2: 0.998
decay_method: noam
warmup_steps: 8000
learning_rate: 2
# Disabled alternative:
#learning_rate: 0.00005
max_grad_norm: 0
label_smoothing: 0.1
normalization: tokens
# Glorot initialisation is switched on; `param_init` is set to 0.
param_init_glorot: true
param_init: 0
# Model
# Transformer encoder and decoder, 6 layers each, 8 heads.
encoder_type: 'transformer'
decoder_type: 'transformer'
enc_layers: 6
dec_layers: 6
heads: 8
position_encoding: true
# Hidden, embedding and feed-forward sizes.
rnn_size: 512
# NOTE(review): if `word_vec_size` is also set earlier in this file, loaders
# that keep the last duplicate key will use this value.
word_vec_size: 512
transformer_ff: 2048
# Dropout 0.1 for both regular and attention dropout, listed against step 0.
dropout_steps:
  - 0
dropout:
  - 0.1
attention_dropout:
  - 0.1
# Embedding sharing flags.
share_embeddings: false
share_decoder_embeddings: true