#!/usr/bin/env bash
# Configuration for multi-language continued pre-training of CoCoSoDa
# (MoCo-style contrastive code search built on top of UniXcoder).

# Evaluation language used for the train/valid/test/codebase data paths.
lang=ruby

# Timestamp for naming runs (currently unused below; kept for convenience).
current_time=$(date "+%Y%m%d%H%M%S")

# Maximum token lengths for code and natural-language inputs.
code_length=64
nl_length=64

# Model / MoCo hyper-parameters.
model_type=multi-loss-cocosoda
moco_k=1024     # negative-sample queue size
moco_m=0.999    # momentum-encoder update coefficient
moco_t=0.07     # contrastive softmax temperature
lr=2e-5

# Training schedule.
epoch=10
batch_size=128
max_steps=100000
save_steps=1000

# NOTE(review): declared but never read below — continue_pre_train passes a
# hard-coded "--data_aug_type other" instead; confirm which one is intended.
data_aug_type="replace_type"

# Space-separated list of pre-training corpora. It is intentionally a plain
# string (not an array) because it is word-split into one argument per file
# when passed to run.py; paths must therefore contain no spaces.
# NOTE: "couninue" is a typo, but it must match run.py's argument name.
couninue_pre_train_data_files='dataset/java/train.jsonl dataset/javascript/train.jsonl dataset/python/train.jsonl dataset/php/train.jsonl dataset/go/train.jsonl dataset/ruby/train.jsonl'

# GPUs visible to the training process.
CUDA_VISIBLE_DEVICES="0,1"

# HuggingFace model family: resolves to microsoft/${base_model}-base.
base_model=unixcoder
#######################################
# Run multi-language continued pre-training via run.py, then evaluate
# (--do_test) on the ${lang} code-search benchmark.
# Globals (read): lang model_type moco_k moco_m moco_t lr epoch batch_size
#   max_steps save_steps code_length nl_length base_model
#   couninue_pre_train_data_files CUDA_VISIBLE_DEVICES
# Globals (written): output_dir
# Outputs: training log on stdout, also tee'd to
#   ${output_dir}/save_tokenizer.log
#######################################
function continue_pre_train () {
  output_dir=./saved_models/cocosoda/
  mkdir -p "${output_dir}" || return 1
  echo "${output_dir}"

  # NOTE(review): a literal "other" is passed for --data_aug_type even though
  # a global data_aug_type=replace_type exists — confirm which is intended.
  # ${couninue_pre_train_data_files} is deliberately unquoted so it word-splits
  # into one path argument per training file (shellcheck disable=SC2086).
  CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} python run.py --eval_frequency 100 \
    --moco_m "${moco_m}" --moco_t "${moco_t}" \
    --output_dir "${output_dir}" \
    --moco_k "${moco_k}" \
    --model_type "${model_type}" \
    --data_aug_type other \
    --config_name=microsoft/${base_model}-base \
    --model_name_or_path=microsoft/${base_model}-base \
    --tokenizer_name=microsoft/${base_model}-base \
    --lang="${lang}" \
    --do_test \
    --time_score 1 \
    --do_multi_lang_continue_pre_train \
    --max_steps "${max_steps}" \
    --save_steps "${save_steps}" \
    --gradient_accumulation_steps 1 \
    --logging_steps 50 \
    --couninue_pre_train_data_files ${couninue_pre_train_data_files} \
    --train_data_file=dataset/"${lang}"/train.jsonl \
    --eval_data_file=dataset/"${lang}"/valid.jsonl \
    --test_data_file=dataset/"${lang}"/test.jsonl \
    --codebase_file=dataset/"${lang}"/codebase.jsonl \
    --num_train_epochs "${epoch}" \
    --code_length "${code_length}" \
    --nl_length "${nl_length}" \
    --train_batch_size "${batch_size}" \
    --eval_batch_size 64 \
    --learning_rate "${lr}" \
    --seed 123456 2>&1 | tee "${output_dir}"/save_tokenizer.log
}
# Entry point: launch continued pre-training when the script is executed.
continue_pre_train