Spaces:
Runtime error
Runtime error
File size: 6,323 Bytes
f6ff4fa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
#!/usr/bin/env bash
# sh run.sh --stage -1 --stop_stage 2 --system_version centos --pretrained_model_name bloom-1b4-zh --final_model_name bloom-1b4-sft
# sh run.sh --stage -1 --stop_stage 1 --system_version centos --pretrained_model_name bloom-1b4-zh
# sh run.sh --stage 1 --stop_stage 1 --system_version centos --pretrained_model_name bloom-1b4-zh
# sh run.sh --stage 2 --stop_stage 2 --system_version centos --pretrained_model_name bloom-1b4-zh --final_model_name bloom-1b4-sft
# sh run.sh --stage 1 --stop_stage 1 --system_version windows --pretrained_model_name bloom-1b4-zh
# Default settings. Every variable assigned here may be overridden on the
# command line as "--<name> <value>" by the option parser further down.

# Platform selector; it decides which Python interpreter path is used.
system_version='windows'

# When "true", progress messages are echoed.
verbose='true'

# First and last stage to execute (inclusive):
#   -1 download data, 0 download pretrained model, 1 train, 2 collect files.
stage='0'
stop_stage='5'

# Hugging Face account and repository of the pretrained model.
# Other repositories that were used here: bloom-396m-zh, bloom-820m-zh
#pretrained_model_name=bloom-396m-zh
#pretrained_model_name=bloom-820m-zh
pretrained_model_supplier='YeungNLP'
pretrained_model_name='bloom-1b4-zh'

# Directory name (under trained_models) that receives the collected files.
final_model_name='final_model_name'

# Subtracted from the newest checkpoint number when a checkpoint is selected.
patience='0'
# parse options
#######################################
# Parse leading "--name value" pairs and store each value in the shell
# variable of the same name (dashes in the option name become underscores).
# Only variables that are already set are accepted as options. Parsing stops
# at the first empty argument or the first argument not starting with "--".
# Globals:   the variable named by each option (written);
#            parsed_option_count (written) - number of arguments consumed
# Arguments: the command-line arguments to parse
# Outputs:   error messages on stderr
# Returns:   0 on success. Exits with status 1 on an unknown option, on an
#            option without a value, or when a variable whose current value
#            is "true"/"false" is given any other value.
#######################################
parse_options() {
  local _opt_name _opt_old_value
  parsed_option_count=0

  while [[ -n "${1:-}" ]]; do
    case "$1" in
      --*)
        _opt_name="${1#--}"
        _opt_name="${_opt_name//-/_}"

        # Reject names that are not identifiers before using them indirectly,
        # then require that the variable already exists (has a default).
        if [[ ! "${_opt_name}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] \
          || [[ -z "${!_opt_name+xxx}" ]]; then
          echo "$0: invalid option $1" 1>&2
          exit 1
        fi

        # Without this check "shift 2" fails, nothing is shifted and the
        # loop would spin forever on a trailing option.
        if [[ $# -lt 2 ]]; then
          echo "$0: option $1 requires a value" 1>&2
          exit 1
        fi

        # Check that Boolean-valued arguments are really Boolean. The current
        # value is read through indirection so the check actually sees it.
        _opt_old_value="${!_opt_name}"
        if [[ "${_opt_old_value}" == "true" || "${_opt_old_value}" == "false" ]] \
          && [[ "$2" != "true" && "$2" != "false" ]]; then
          echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
          exit 1
        fi

        # printf -v stores the value literally, so spaces, quotes and "$"
        # inside the value are kept as typed and nothing is evaluated.
        printf -v "${_opt_name}" '%s' "$2"

        shift 2
        parsed_option_count=$((parsed_option_count + 2))
        ;;
      *)
        break
        ;;
    esac
  done
}

parse_options "$@"
# Drop the consumed arguments from the script's own positional parameters.
shift "${parsed_option_count}"
# Report the selected platform when verbose output is enabled.
if $verbose; then
  echo "system_version: ${system_version}"
fi

# All working paths are derived from the directory the script is started in.
work_dir="$(pwd)"

# Local scratch area: file_dir holds the cache and the training output.
file_dir="${work_dir}/file_dir"
cache_dir="${file_dir}/cache_dir"
serialization_dir="${file_dir}/serialization_dir"

# Training data lives at a fixed absolute location.
data_dir="/data/tianxing/PycharmProjects/datasets/firefly_train_1_1m"

# Pretrained and final models are kept three levels above the work directory.
pretrained_models_dir="${work_dir}/../../../pretrained_models/huggingface/${pretrained_model_supplier}"
final_model_dir="${work_dir}/../../../trained_models/${final_model_name}"

# Create every directory up front; -p makes this a no-op for existing ones.
mkdir -p \
  "${file_dir}" \
  "${cache_dir}" \
  "${serialization_dir}" \
  "${data_dir}" \
  "${pretrained_models_dir}" \
  "${final_model_dir}"

# Python module search path: three levels above the work directory.
export PYTHONPATH="${work_dir}/../../.."
#######################################
# Choose the Python interpreter for the platform named by system_version.
# Globals:   system_version (read)
#            python_bin (written) - interpreter path, empty for an
#            unrecognised platform
# Arguments: none
#######################################
select_python_bin() {
  case "${system_version:-}" in
    windows)
      python_bin='C:/Users/tianx/PycharmProjects/virtualenv/Transformers/Scripts/python.exe'
      ;;
    centos | ubuntu)
      # conda activate Transformers
      python_bin='/usr/local/miniconda3/envs/Transformers/bin/python3'
      ;;
    *)
      python_bin=''
      ;;
  esac
}

#######################################
# Run the selected interpreter. A shell function is used instead of
# "alias python3=..." because Bash does not expand aliases in non-interactive
# shells unless expand_aliases is set, so under bash the alias had no effect
# and whatever python3 was on PATH ran instead.
# Globals:   python_bin (read)
# Arguments: passed through unchanged to the interpreter
# Returns:   the interpreter's exit status
#######################################
python3() {
  if [[ -n "${python_bin:-}" ]]; then
    "${python_bin}" "$@"
  else
    # Unrecognised platform: fall back to python3 from PATH, bypassing
    # this function.
    command python3 "$@"
  fi
}

select_python_bin
#######################################
# Print the name of the checkpoint directory to use.
# Entries of serialization_dir named "checkpoint-<number>" are considered.
# The highest number minus patience gives a target; the entry with the
# largest number not exceeding that target is printed.
# Globals:   serialization_dir (read)
# Arguments: $1 - patience, an integer subtracted from the highest number
# Outputs:   "checkpoint-<number>" on stdout, or an empty line when no
#            entry qualifies
# Returns:   0 normally; exits with status 1 when serialization_dir is not
#            a directory
#######################################
search_best_ckpt() {
  local patience="${1:-0}"
  local entry number
  local last_number=''
  local target_number
  local best_number=''
  local best_name=''
  local -a numbers=()

  if [[ ! -d "${serialization_dir}" ]]; then
    echo "search_best_ckpt: not a directory: ${serialization_dir}" 1>&2
    exit 1
  fi

  # Glob instead of parsing ls; an unmatched pattern stays literal and is
  # dropped by the numeric check below, as is anything that is not exactly
  # "checkpoint-<digits>".
  for entry in "${serialization_dir}"/checkpoint-*; do
    number="${entry##*/checkpoint-}"
    [[ "${number}" =~ ^[0-9]+$ ]] || continue
    numbers+=("${number}")
    # 10# forces base 10 so a leading zero is not read as octal.
    if [[ -z "${last_number}" ]] || ((10#${number} > 10#${last_number})); then
      last_number="${number}"
    fi
  done

  if [[ -n "${last_number}" ]]; then
    target_number=$((10#${last_number} - patience))
    for number in "${numbers[@]}"; do
      ((10#${number} <= target_number)) || continue
      if [[ -z "${best_number}" ]] || ((10#${number} > 10#${best_number})); then
        best_number="${number}"
        best_name="checkpoint-${number}"
      fi
    done
  fi

  echo "${best_name}"
}
#######################################
# Print the size in bytes of a regular file.
# Arguments: $1 - path of the file
# Outputs:   the byte count on stdout; nothing when $1 is not a regular file
# Returns:   0
#######################################
local_file_size() {
  [[ -f "$1" ]] || return 0
  # tr strips the padding some wc implementations put around the number.
  wc -c <"$1" | tr -d '[:space:]'
}

# Stage -1: fetch the training data unless a complete copy is already there.
if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
  $verbose && echo "stage -1: download data"
  cd "${data_dir}" || exit 1

  # The size is empty for a missing file, so this one comparison covers both
  # "not downloaded yet" and "partially downloaded" without running ls on a
  # file that may not exist. 1171119212 is the byte count this script
  # expects for the complete file.
  firefly_train_1_1m_size="$(local_file_size firefly-train-1.1M.jsonl)"
  if [ "${firefly_train_1_1m_size}" != "1171119212" ]; then
    # rm firefly-train-1.1M.jsonl
    # -c resumes a partial download.
    wget -c https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M/resolve/main/firefly-train-1.1M.jsonl
  fi
fi
# Stage 0: obtain the pretrained model repository and its large files.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  if $verbose; then
    echo "stage 0: download pretrained model"
  fi

  hub_repo_url="https://huggingface.co/${pretrained_model_supplier}/${pretrained_model_name}"
  local_model_dir="${pretrained_models_dir}/${pretrained_model_name}"

  cd "${work_dir}" || exit 1
  cd "${pretrained_models_dir}" || exit 1

  # Clone only when the model directory is absent, then delete the git
  # metadata and the files listed below from the fresh clone; the two files
  # needed later are downloaded directly further down.
  if [ ! -d "${pretrained_model_name}" ]; then
    git clone "${hub_repo_url}/"

    cd "${local_model_dir}" || exit 1
    rm -rf .git flax_model.msgpack model.safetensors pytorch_model.bin tokenizer.json
  fi

  cd "${local_model_dir}" || exit 1

  # Download each required file that is not present yet; -c resumes a
  # partial transfer.
  for hub_file in pytorch_model.bin tokenizer.json; do
    if [ ! -e "${hub_file}" ]; then
      wget -c "${hub_repo_url}/resolve/main/${hub_file}"
    fi
  done
fi
# Stage 1: train the model, resuming from a saved checkpoint when one exists.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  $verbose && echo "stage 1: train model"
  cd "${work_dir}" || exit 1

  # Empty when the serialization directory holds no usable checkpoint.
  target_dir=$(search_best_ckpt "${patience}")

  # The command line is built as an array so that every path stays a single
  # argument even if it contains spaces.
  train_args=(
    --train_file "${data_dir}/firefly-train-1.1M.jsonl"
    --pretrained_model_name_or_path "${pretrained_models_dir}/${pretrained_model_name}"
    --output_dir "${serialization_dir}"
    --cache_dir "${cache_dir}"
    --fp16
  )

  resume_from_checkpoint=
  if [ -n "${target_dir}" ]; then
    resume_from_checkpoint="${serialization_dir}/${target_dir}"
    echo "resume_from_checkpoint: ${resume_from_checkpoint}"
    # Only pass the flag when there is something to resume from.
    train_args+=(--resume_from_checkpoint "${resume_from_checkpoint}")
  fi

  python3 1.train_model.py "${train_args[@]}"
fi
# Stage 2: assemble the final model directory from the selected checkpoint's
# weights plus the configuration and tokenizer files of the pretrained model.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  $verbose && echo "stage 2: collect files"

  target_dir=$(search_best_ckpt "${patience}")
  cd "${work_dir}" || exit 1

  # Without a checkpoint the source path below would be meaningless, so stop
  # with a clear message instead of letting cp fail on a bogus path.
  if [ -z "${target_dir}" ]; then
    echo "$0: no checkpoint found in ${serialization_dir}" 1>&2
    exit 1
  fi

  # Trained weights come from the checkpoint.
  cp "${serialization_dir}/${target_dir}/pytorch_model.bin" "${final_model_dir}/pytorch_model.bin" || exit 1

  # Everything else is copied unchanged from the pretrained model.
  for collected_file in config.json special_tokens_map.json tokenizer_config.json tokenizer.json; do
    cp "${pretrained_models_dir}/${pretrained_model_name}/${collected_file}" "${final_model_dir}/${collected_file}" || exit 1
  done
fi
|