# Source file: merit-albert-v2-xxlarge-v1 / training_config.yaml
# Uploaded by: chitanda
# Commit message: upload models
# Commit: 4622991
# Training configuration dump for an ALBERT-xxlarge-v2 pre-training run.
#
# NOTE(review): the copy this was restored from had lost all indentation,
# which left `_target_`, `max_seq_length` and `num_workers` duplicated at the
# top level. The nesting below is reconstructed from the `_target_` entries
# and the duplicate keys; confirm section boundaries against the consuming
# code before relying on them.

# --- Data files -------------------------------------------------------------
train_file: /home/share/jiaofangkai/wiki_erica_path/v7/union/train_distant.path_v7.train.0.pkl
dev_file: /home/share/jiaofangkai/wiki_erica_path/v7/union/train_distant.path_v7.dev.pkl
test_file: null

# --- Model ------------------------------------------------------------------
# `_target_` is a dotted import path; the remaining keys in the section are
# presumably passed to it as keyword arguments (Hydra-style instantiation).
model:
  _target_: models.albert_baseline.AlbertForMultipleChoicePreTrain.from_pretrained
  mlp_hidden_size: 8192

# --- Feature conversion -----------------------------------------------------
read_tensor:
  _target_: dataset.wiki_entity_path_v8_2.convert_examples_into_features
  max_neg_num: 3
  aug_num: 1
  max_seq_length: 256
  shuffle_context: true
  min_rep_num: 5
  geo_p: 0.4
  deduct_ratio: 1.0
  context_ratio: 1.0
  num_workers: 64

# NOTE(review): placed at top level by assumption; it could also belong to
# `read_tensor` -- verify against the caller.
extended_vocab: null

# --- Batch collation --------------------------------------------------------
collator:
  _target_: dataset.wiki_entity_path_v8.WikiPathDatasetCollatorWithContext
  max_seq_length: 256
  tokenizer: pretrained-models/albert-xxlarge-v2
  mlm_probability: 0.15
  max_option_num: 4
  swap: true

# --- Data loading -----------------------------------------------------------
# NOTE(review): assumed to be top-level data-loader settings rather than
# collator arguments; this `num_workers` is distinct from
# `read_tensor.num_workers` above.
num_workers: 8
prefetch_factor: 2

# --- Checkpoints and outputs ------------------------------------------------
model_name_or_path: pretrained-models/albert-xxlarge-v2
pretrain: null
output_dir: experiments/albert.xxlarge.path.v7_v8.2.2.1aug.ctx.TeslaT4

# --- Run mode ---------------------------------------------------------------
# NOTE(review): `Train` is a string, not a boolean, unlike the other flags
# here. Value kept as found; confirm whether `true` was intended.
do_train: Train
evaluate_during_training: true
do_eval: false
eval_sub_path: null
do_preprocess: false

# --- Optimisation -----------------------------------------------------------
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
learning_rate: 5.0e-05
gradient_accumulation_steps: 2048
weight_decay: 0.01
adam_epsilon: 1.0e-06
# This is a string, not a YAML sequence; the consumer has to parse it itself.
adam_betas: '(0.9, 0.98)'
max_grad_norm: 5.0
num_train_epochs: 1
max_steps: 100
# NOTE(review): both warm-up settings are present; which one takes precedence
# depends on the training script.
warmup_proportion: 0.2
warmup_steps: 0
optimizer: lamb
use_nvlamb: true

# --- Logging, checkpointing, evaluation cadence -----------------------------
logging_steps: 1
save_steps: 50
eval_steps: 50

# --- Device and precision ---------------------------------------------------
no_cuda: false
seed: 42
local_rank: 0
fp16: true
fp16_opt_level: O1

# --- Sharding / offload options ---------------------------------------------
reshard_after_forward: false
cpu_offload: false
move_grads_to_cpu: false
move_params_to_cpu: false

# --- Runtime values ---------------------------------------------------------
# NOTE(review): these look like values recorded by the training script at
# run time rather than user-supplied settings -- confirm.
n_gpu: 1
device: cuda:0
train_batch_size: 1
eval_batch_size: 1

note: null