# File: train-scripts/infer_robust_2.sh
# Uploader: Ashton2000
# Commit message: Upload folder using huggingface_hub
# Commit: 981b783 (verified)
# trap 'ssh wyt@${infer_address%%:*} "killall pt_main_thread"; exit' SIGINT

# Positional arguments:
#   $1 device       GPU index; exported as CUDA_VISIBLE_DEVICES when deploying
#                   and appended to "801" to form the server port
#   $2 deploy_flag  the literal string "true" launches a vLLM server first;
#                   any other value skips deployment
#   $3 step         checkpoint step; used in the vLLM log file name and in the
#                   name of the output directory
#   $4 model_path   model handed to `vllm serve`; only read when deploying
#   $5 language     language pair written as <src>-<tgt>
device=$1
deploy_flag=$2
step=$3
# model_path=/data/wyt/codes/DocDPO/sft/checkpoints_llama_factory/lora/ted_react_trans_base_sample_sft_dpolora_balanced_474/merged_fix/checkpoint-${step}
model_path=$4
language=$5

# device, deploy_flag, step and language are needed on every run.
if [ -z "$device" ] || [ -z "$deploy_flag" ] || [ -z "$step" ] || [ -z "$language" ]; then
  echo "Usage: $0 <device> <deploy_flag> <step> <model_path> <language>" >&2
  exit 1
fi

# model_path is only consumed by `vllm serve`, so an empty value is accepted
# when no server is being deployed.
if [ "$deploy_flag" = "true" ] && [ -z "$model_path" ]; then
  echo "Error: <model_path> is required when <deploy_flag> is true" >&2
  exit 1
fi

# Without a '-' separator both expansions below would return the whole string,
# silently making source and target language identical.
case "$language" in
  ?*-?*) ;;
  *)
    echo "Error: <language> must look like <src>-<tgt> (got '$language')" >&2
    exit 1
    ;;
esac

src_lang=${language%-*}   # everything before the last '-'
tgt_lang=${language#*-}   # everything after the first '-'
# ---- Server endpoint --------------------------------------------------------
# A single host:port value; the port is "801" followed by the device index.
# Endpoints used previously, kept for reference:
#   infer_address=10.249.42.177:8010
#   schedule_address=10.249.42.177:8011
#   infer_address=127.0.0.1:801$infer_device
#   schedule_address=127.0.0.1:801$schedule_device
#   address=10.249.42.182:801${device}
#   address=127.0.0.1:801${device}
address="10.249.45.139:801${device}"

# Settings used previously, kept for reference:
#   setting=window_20_1ep
#   setting=window_20_2ep_new

# ---- Directories ------------------------------------------------------------
# Directory the script changes into before invoking infer.py.
work_dir="/data/wyt/codes/DocDPO/inference_monolang/ted_en_zh_balanced_paritial"

# Root of the input data; source files are looked up under <src>-<tgt>/ below it.
# Previous value: data_dir=/data/wyt/codes/DocDPO/data/2017-01-ted-test
data_dir="/data/wyt/codes/DocDPO/data/ted_robust/level_3"

# Root of the results; one sub-directory per language pair and checkpoint step.
output_dir="/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/results_robust/level_3"
# Optionally launch a vLLM server for ${model_path} in the background.
# Runs only when deploy_flag is exactly "true". The host part of ${address}
# selects the mode: 127.0.0.1 starts the server on this machine, any other
# host starts it there over ssh as user wyt.
if [ "$deploy_flag" = "true" ]; then
  server_host=${address%%:*}   # text before the first ':'
  server_port=${address##*:}   # text after the last ':'
  if [ "$server_host" = "127.0.0.1" ]; then
    # Local launch: serve from the "vllm" conda env, log to vllm_<step>.log in
    # the current directory, then switch to "optima-vllm" for the rest of the
    # script.
    source ~/.zshrc
    conda activate vllm
    CUDA_VISIBLE_DEVICES="${device}" nohup vllm serve "${model_path}" \
      --host 0.0.0.0 \
      --port "${server_port}" \
      --served-model-name "qwen" \
      --enable-prefix-caching \
      --gpu_memory_utilization 0.9 > "vllm_${step}.log" 2>&1 &
    conda activate optima-vllm
  else
    # Remote launch: the whole command is one double-quoted string expanded
    # locally, so it must not contain bare inner double quotes. model_path is
    # wrapped in single quotes so the remote shell receives it as one literal
    # word. `ssh -n` keeps ssh from reading this script's stdin.
    # NOTE(review): unlike the local branch, this serves from the optima-vllm
    # env, passes no --gpu_memory_utilization and discards the server output;
    # confirm that this is intended.
    ssh -n "wyt@${server_host}" "source ~/.zshrc && conda activate optima-vllm && CUDA_VISIBLE_DEVICES=${device} nohup vllm serve '${model_path}' --host 0.0.0.0 --port ${server_port} --served-model-name qwen --enable-prefix-caching > /dev/null 2>&1 &"
  fi
fi
# Block until the server at ${address} answers.
# test_api.py (resolved relative to the current directory) is run with the
# address as its only argument; exit status 0 means the API is reachable.
# The fixed 20-second pause is disabled, so the polling loop below does all
# of the waiting even though the first message still mentions it.
#
# Environment:
#   API_MAX_RETRIES  optional positive integer; give up with exit status 1
#                    after that many failed probes. Unset, 0 or non-numeric
#                    means retry forever, which is the historical behaviour.
echo "Waiting for LLM deployment in 20 seconds..."
# sleep 20
echo "Testing API of ${address}..."

api_max_retries=${API_MAX_RETRIES:-0}
case "$api_max_retries" in
  '' | *[!0-9]*) api_max_retries=0 ;;   # ignore malformed values
esac

api_attempts=0
# Test the command's exit status directly instead of inspecting $? afterwards.
until python test_api.py "$address"; do
  api_attempts=$((api_attempts + 1))
  if [ "$api_max_retries" -gt 0 ] && [ "$api_attempts" -ge "$api_max_retries" ]; then
    echo "API still unreachable after ${api_attempts} attempts; giving up." >&2
    exit 1
  fi
  echo "API connection failed. Retrying in 5 seconds..."
  sleep 5
done
echo "API connected successfully!"
# Run infer.py from ${work_dir} on the numbered test files 6..11 of the
# selected language pair, skipping every file whose expected output already
# exists so that an interrupted run can be resumed.
# The same ${address} is passed as both the inference and the scheduling
# endpoint. A failing infer.py run does not stop the loop.
cur_path=$(pwd)
# Abort rather than run infer.py from whatever directory we happen to be in.
cd "$work_dir" || { echo "Cannot enter work_dir: $work_dir" >&2; exit 1; }

pair="${src_lang}-${tgt_lang}"
result_dir="${output_dir}/${pair}_${step}"

for i in {6..11}; do
# for i in {0..11}; do
  src_name="IWSLT17.TED.tst2017.${pair}.${src_lang}.$i"
  # Presumably infer.py writes <src_name>.<tgt_lang> into result_dir; the
  # script only relies on that path as its "already done" marker.
  if [ ! -f "${result_dir}/${src_name}.${tgt_lang}" ]; then
    echo "${src_name}.${tgt_lang}"
    python -u infer.py \
      --src_file "${data_dir}/${pair}/${src_name}" \
      --output_path "${result_dir}" \
      --window_size 10 \
      --infer_address "$address" \
      --schedule_address "$address" \
      --language "${pair}" \
      --infer_temperature 0.7 \
      --schedule_temperature 0.7 \
      --translate_style base
  fi
done

cd "$cur_path" || exit 1
# ssh wyt@${infer_address%%:*} "killall pt_main_thread"