File size: 3,745 Bytes
186701e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/bin/bash
#
# Submit a job through jizhi_client.
#
# Positional arguments (see the usage line below):
#   $1  number of hosts
#   $2  number of GPUs per host
#   $3  task name
#   $4+ command line appended to the generated start script

#######################################
# Print the usage line and stop the script when help is requested.
# Arguments: $1 - first command-line argument of the script (may be empty)
# Outputs:   usage line on stdout when $1 is --help or -h
# Returns:   exits the shell with status 0 on --help/-h, otherwise returns
#######################################
exit_if_help_requested() {
    case "$1" in
        --help|-h)
            echo "Usage: jizhi_run NUM_MECHINES NUM_GPUS TASK_NAME <CMDS>"
            # Without this exit the script would carry on and submit a job
            # using "--help" as the host count.
            exit 0
            ;;
    esac
}

exit_if_help_requested "$@"

# User configuration.
# Every setting below can be provided through the environment. A setting
# that is unset or empty falls back to the default given here.

# Self-assignment: keeps whatever value TOKEN already has (and defines it as
# an empty string when it was unset).
TOKEN="${TOKEN}"

# ${VAR:=default} assigns the default only when VAR is unset or empty; the
# leading ':' is a no-op command that just triggers the expansion.
: "${IMAGE_FULL_NAME:=mirrors.tencent.com/ronnysong_rd/fastdet:torch2.0.1-cuda11.7}"
: "${BUSINESS_FLAG:=TEG_AILab_CVC_chongqing}"
: "${CEPH_BUSINESS_FLAG:=TEG_AILab_CVC_chongqing}"
: "${GPU_NAME:=V100}"
: "${PRIORITY_LEVEL:=HIGH}"
: "${ELASTIC_LEVEL:=1}"
: "${RDMA:=false}"
: "${CUDA:=11.0}"

# Names of the two generated files (relative to the current directory), the
# directory the job will run in, and a per-invocation suffix for the task name.
CMD_PATH="start.sh"
CONF_PATH="jizhi_conf.json"
ROOT_PATH=$PWD
UUID=$(date +%s)   # seconds since the epoch, not a real UUID

#######################################
# Generate the executable start script at $CMD_PATH.
# The script changes into $ROOT_PATH, exports cache locations and runtime
# settings, and finally runs the user command.
# Globals:   CMD_PATH, ROOT_PATH, BUSINESS_FLAG (all read)
# Arguments: the user command; all arguments are joined with single spaces
#            and written as the last line, so shell syntax passed in a
#            single quoted argument is kept as-is.
# Outputs:   (re)creates $CMD_PATH and marks it executable
#######################################
write_start_script() {
    local root=$ROOT_PATH

    rm -f "$CMD_PATH"
    {
        # The path is double-quoted in the generated script (like the cache
        # exports below) so a directory containing spaces still works.
        printf 'cd "%s"\n' "$root"

        # printf re-applies the format for each (name, root, subdir) triple.
        printf 'export %s="%s/work_dirs/.cache/%s"\n' \
            HF_HOME "$root" hf \
            TORCH_HOME "$root" torch \
            CLIP_CACHE "$root" clip \
            TRANSFORMERS_CACHE "$root" transformers

        printf 'export %s\n' \
            'MKL_NUM_THREADS=1' \
            'OMP_NUM_THREADS=1' \
            'TOKENIZERS_PARALLELISM=false' \
            'TORCH_DISTRIBUTED_DEBUG=INFO' \
            'NCCL_IB_GID_INDEX=3'

        # Extra network settings applied only for this business flag.
        # NCCL_IB_GID_INDEX=3 is already exported unconditionally above, so it
        # is not repeated here.
        if [[ "$BUSINESS_FLAG" == "TaiJi_HYAide_BUFFER_SH_A800H" ]]; then
            printf 'export %s\n' \
                'NCCL_IB_SL=3' \
                'NCCL_CHECKS_DISABLE=1' \
                'NCCL_P2P_DISABLE=0' \
                'NCCL_IB_DISABLE=0' \
                'NCCL_LL_THRESHOLD=16384' \
                'NCCL_IB_CUDA_SUPPORT=1' \
                'NCCL_SOCKET_IFNAME=bond1' \
                'UCX_NET_DEVICES=bond1' \
                'NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6' \
                'NCCL_COLLNET_ENABLE=0' \
                'SHARP_COLL_ENABLE_SAT=0' \
                'NCCL_NET_GDR_LEVEL=2' \
                'NCCL_IB_QPS_PER_CONNECTION=4' \
                'NCCL_IB_TC=160' \
                'NCCL_PXN_DISABLE=1'
        fi

        # "$*" joins the arguments with spaces without re-splitting or
        # glob-expanding them; printf (unlike echo) never treats a leading
        # -n/-e in the command as an option.
        printf '%s\n' "$*"
    } > "$CMD_PATH"

    chmod +x "$CMD_PATH"
}

# Everything after NUM_MECHINES NUM_GPUS TASK_NAME is the user command.
write_start_script "${@:4}"

#INIT_CMD="jizhi_client mount -bf TEG_AILab_CVC_chongqing -tk $TOKEN"
INIT_CMD=""

#######################################
# Write the JSON job description to $CONF_PATH.
# Values are inserted verbatim (no JSON escaping is performed), but they are
# no longer word-split or glob-expanded on the way into the file.
# Globals:   CONF_PATH, TOKEN, BUSINESS_FLAG, ROOT_PATH, CMD_PATH, UUID,
#            PRIORITY_LEVEL, ELASTIC_LEVEL, CUDA, IMAGE_FULL_NAME, GPU_NAME,
#            CEPH_BUSINESS_FLAG, RDMA, INIT_CMD (all read)
# Arguments: $1 - value of "host_num" (written unquoted)
#            $2 - value of "host_gpu_num" (written unquoted)
#            $3 - task name; "_$UUID" is appended to form "task_flag"
# Outputs:   (re)creates $CONF_PATH
#######################################
write_jizhi_conf() {
    local host_num=$1 host_gpu_num=$2 task_name=$3

    rm -f "$CONF_PATH"
    # Unquoted delimiter: variables expand inside the here-document, while the
    # double quotes in the text stay literal.
    cat > "$CONF_PATH" <<EOF
{
"Token": "$TOKEN",
"business_flag": "$BUSINESS_FLAG",
"model_local_file_path": "$ROOT_PATH/$CMD_PATH",
"host_num": $host_num,
"host_gpu_num": $host_gpu_num,
"task_flag": "${task_name}_$UUID",
"priority_level": "$PRIORITY_LEVEL",
"elastic_level": $ELASTIC_LEVEL,
"cuda_version": "$CUDA",
"image_full_name": "$IMAGE_FULL_NAME",
"GPUName": "$GPU_NAME",
"mount_ceph_business_flag": "$CEPH_BUSINESS_FLAG",
"exec_start_in_all_mpi_pods": true,
"enable_rdma": $RDMA,
"init_cmd": "$INIT_CMD",
"envs": {
    "HUNYUAN_TASK_CATEGORY": "LLM",
    "HUNYUAN_TASK_MODEL_TYPE": "SFT",
    "HUNYUAN_TASK_DOMAIN": "NLP",
    "HUNYUAN_TASK_START_MODEL_TYPE": "7B冷启"}
}
EOF
}

write_jizhi_conf "$1" "$2" "$3"

# Submit the job described by $CONF_PATH, then delete the generated files.
#
# The cleanup runs from an EXIT trap so that the start script and the config
# file (which contains the token) are removed even when the script is
# interrupted while jizhi_client is running.
remove_generated_files() {
    rm -f "$CMD_PATH" "$CONF_PATH"
}
trap remove_generated_files EXIT

# Kept as the last command so that the script's exit status is the status of
# jizhi_client rather than that of the cleanup.
jizhi_client start -scfg "$CONF_PATH"