#!/bin/bash
# Continue pre-training a CoCoSoDa-style contrastive model on a multi-language
# corpus, then run retrieval evaluation on the target language below.

lang=ruby

current_time=$(date "+%Y%m%d%H%M%S")  # timestamp for tagging runs (unused below)
# current_time=tmp
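# A possible use of ${current_time} (an assumption, not wired in below): append
# it to output_dir so repeated runs do not overwrite each other, e.g.
#   output_dir=./saved_models/cocosoda/${current_time}/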

code_length=64  # max number of code tokens fed to the encoder
nl_length=64    # max number of natural-language (query/docstring) tokens

model_type=multi-loss-cocosoda  # options: "base", "cocosoda", "multi-loss-cocosoda"
moco_k=1024   # size of the MoCo negative-sample queue
moco_m=0.999  # momentum for the key-encoder update
lr=2e-5       # learning rate
moco_t=0.07   # temperature of the contrastive loss
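
# Background for the three moco_* knobs above: MoCo-style training keeps a
# queue of moco_k encoded negatives, updates the key encoder as an exponential
# moving average of the query encoder (theta_k <- moco_m * theta_k +
# (1 - moco_m) * theta_q), and divides the InfoNCE logits by moco_t.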

epoch=10
batch_size=128
max_steps=100000
save_steps=1000
data_aug_type="other"  # value passed to run.py below; alternative: "replace_type"
# Note: the variable name below intentionally matches run.py's
# --couninue_pre_train_data_files argument, typo included.
couninue_pre_train_data_files='dataset/java/train.jsonl dataset/javascript/train.jsonl dataset/python/train.jsonl dataset/php/train.jsonl dataset/go/train.jsonl dataset/ruby/train.jsonl'
CUDA_VISIBLE_DEVICES="0,1"
base_model=unixcoder
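
# The continue-pre-training corpus above spans the six CodeSearchNet languages;
# fine-tuning and evaluation below use only dataset/${lang}/.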

function continue_pre_train () {
    output_dir=./saved_models/cocosoda/
    mkdir -p ${output_dir}
    echo ${output_dir}
    CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} python run.py --eval_frequency 100 \
        --moco_m ${moco_m} --moco_t ${moco_t} \
        --output_dir ${output_dir} \
        --moco_k ${moco_k} \
        --model_type ${model_type} \
        --data_aug_type ${data_aug_type} \
        --config_name=microsoft/${base_model}-base \
        --model_name_or_path=microsoft/${base_model}-base \
        --tokenizer_name=microsoft/${base_model}-base \
        --lang=${lang} \
        --do_test \
        --time_score 1 \
        --do_multi_lang_continue_pre_train \
        --max_steps ${max_steps} \
        --save_steps ${save_steps} \
        --gradient_accumulation_steps 1 \
        --logging_steps 50 \
        --couninue_pre_train_data_files ${couninue_pre_train_data_files} \
        --train_data_file=dataset/${lang}/train.jsonl \
        --eval_data_file=dataset/${lang}/valid.jsonl \
        --test_data_file=dataset/${lang}/test.jsonl \
        --codebase_file=dataset/${lang}/codebase.jsonl \
        --num_train_epochs ${epoch} \
        --code_length ${code_length} \
        --nl_length ${nl_length} \
        --train_batch_size ${batch_size} \
        --eval_batch_size 64 \
        --learning_rate ${lr} \
        --seed 123456 2>&1 | tee ${output_dir}/continue_pre_train.log
}


continue_pre_train
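
# Usage sketch (assumes run.py and the dataset/ directory sit in the working
# directory, per the paths above; the filename is whatever this script is
# saved as):
#   bash <this-script>.sh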