stevengrove committed on
Commit
54cb9a9
1 Parent(s): e6a8b63

Delete taiji

Browse files
Files changed (4) hide show
  1. taiji/drun +0 -35
  2. taiji/erun +0 -23
  3. taiji/etorchrun +0 -51
  4. taiji/jizhi_run_vanilla +0 -105
taiji/drun DELETED
@@ -1,35 +0,0 @@
1
- #!/bin/bash
2
- DOCKER_IMAGE="mirrors.tencent.com/ronnysong_rd/fastdet:torch2.0.1-cuda11.7"
3
-
4
- if [ ! -n "$DEBUG" ]; then
5
- COMMAND_PREFIX="pip3 install -e ."
6
- else
7
- COMMAND_PREFIX="pip3 install -q -e third_party/mmengine;
8
- pip3 install -q -e third_party/mmdetection;
9
- pip3 install -q -e third_party/mmcv;
10
- pip3 install -q -e third_party/mmyolo;
11
- pip3 install -q -e ."
12
- fi
13
-
14
- sudo nvidia-docker run \
15
- --rm \
16
- -it \
17
- -e NVIDIA_VISIBLE_DEVICES=all \
18
- --env="DISPLAY" \
19
- --env="QT_X11_NO_MITSHM=1" \
20
- --volume="$HOME/.Xauthority:/root/.Xauthority:rw" \
21
- --shm-size=20gb \
22
- --network=host \
23
- -v /apdcephfs/:/apdcephfs/ \
24
- -v /apdcephfs_cq2/:/apdcephfs_cq2/ \
25
- -v /apdcephfs_cq3/:/apdcephfs_cq3/ \
26
- -v /data/:/data/ \
27
- -w $PWD \
28
- $DOCKER_IMAGE \
29
- bash -c "export TRANSFORMERS_CACHE=$PWD/work_dirs/.cache/transformers;
30
- export TORCH_HOME=$PWD/work_dirs/.cache/torch;
31
- export CLIP_CACHE=$PWD/work_dirs/.cache/clip;
32
- export HF_HOME=$PWD/work_dirs/.cache/hf;
33
- export TOKENIZERS_PARALLELISM=false;
34
- $COMMAND_PREFIX
35
- $*"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
taiji/erun DELETED
@@ -1,23 +0,0 @@
1
- #!/bin/bash
2
- export NCCL_IB_GID_INDEX=3
3
-
4
- export TRANSFORMERS_CACHE=$PWD/work_dirs/.cache/transformers
5
- export TORCH_HOME=$PWD/work_dirs/.cache/torch
6
- export CLIP_CACHE=$PWD/work_dirs/.cache/clip
7
- export HF_HOME=$PWD/work_dirs/.cache/hf
8
- export TOKENIZERS_PARALLELISM=false
9
- export MKL_NUM_THREADS=1
10
- export OMP_NUM_THREADS=1
11
- export TORCH_DISTRIBUTED_DEBUG=INFO
12
- export HF_DATASETS_OFFLINE=1
13
- export TRANSFORMERS_OFFLINE=1
14
- export http_proxy="http://star-proxy.oa.com:3128"
15
- export https_proxy="http://star-proxy.oa.com:3128"
16
- export ftp_proxy="http://star-proxy.oa.com:3128"
17
- export no_proxy=".woa.com,mirrors.cloud.tencent.com,tlinux-mirror.tencent-cloud.com,tlinux-mirrorlist.tencent-cloud.com,localhost,127.0.0.1,mirrors-tlinux.tencentyun.com,.oa.com,.local,.3gqq.com,.7700.org,.ad.com,.ada_sixjoy.com,.addev.com,.app.local,.apps.local,.aurora.com,.autotest123.com,.bocaiwawa.com,.boss.com,.cdc.com,.cdn.com,.cds.com,.cf.com,.cjgc.local,.cm.com,.code.com,.datamine.com,.dvas.com,.dyndns.tv,.ecc.com,.expochart.cn,.expovideo.cn,.fms.com,.great.com,.hadoop.sec,.heme.com,.home.com,.hotbar.com,.ibg.com,.ied.com,.ieg.local,.ierd.com,.imd.com,.imoss.com,.isd.com,.isoso.com,.itil.com,.kao5.com,.kf.com,.kitty.com,.lpptp.com,.m.com,.matrix.cloud,.matrix.net,.mickey.com,.mig.local,.mqq.com,.oiweb.com,.okbuy.isddev.com,.oss.com,.otaworld.com,.paipaioa.com,.qqbrowser.local,.qqinternal.com,.qqwork.com,.rtpre.com,.sc.oa.com,.sec.com,.server.com,.service.com,.sjkxinternal.com,.sllwrnm5.cn,.sng.local,.soc.com,.t.km,.tcna.com,.teg.local,.tencentvoip.com,.tenpayoa.com,.test.air.tenpay.com,.tr.com,.tr_autotest123.com,.vpn.com,.wb.local,.webdev.com,.webdev2.com,.wizard.com,.wqq.com,.wsd.com,.sng.com,.music.lan,.mnet2.com,.tencentb2.com,.tmeoa.com,.pcg.com,www.wip3.adobe.com,www-mm.wip3.adobe.com,mirrors.tencent.com,csighub.tencentyun.com"
18
- sed -i 's/np.float/float/g' /usr/local/python/lib/python3.8/site-packages/lvis/eval.py
19
- touch /tmp/.unhold
20
-
21
- pip3 install -e .
22
- $*
23
- rm /tmp/.unhold
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
taiji/etorchrun DELETED
@@ -1,51 +0,0 @@
1
- #!/bin/bash
2
- if [ ! -n "$SH" ]; then
3
- #export NCCL_IB_GID_INDEX=3
4
- export NCCL_IB_DISABLE=1
5
- export NCCL_P2P_DISABLE=1
6
- export NCCL_SOCKET_IFNAME=eth1
7
- else
8
- export NCCL_IB_GID_INDEX=3
9
- export NCCL_IB_SL=3
10
- export NCCL_CHECKS_DISABLE=1
11
- export NCCL_P2P_DISABLE=0
12
- export NCCL_IB_DISABLE=0
13
- export NCCL_LL_THRESHOLD=16384
14
- export NCCL_IB_CUDA_SUPPORT=1
15
- export NCCL_SOCKET_IFNAME=bond1
16
- export UCX_NET_DEVICES=bond1
17
- export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6
18
- export NCCL_COLLNET_ENABLE=0
19
- export SHARP_COLL_ENABLE_SAT=0
20
- export NCCL_NET_GDR_LEVEL=2
21
- export NCCL_IB_QPS_PER_CONNECTION=4
22
- export NCCL_IB_TC=160
23
- export NCCL_PXN_DISABLE=1
24
- export GLOO_SOCKET_IFNAME=bond1
25
- export NCCL_DEBUG=info
26
- fi
27
-
28
- export TRANSFORMERS_CACHE=$PWD/work_dirs/.cache/transformers
29
- export TORCH_HOME=$PWD/work_dirs/.cache/torch
30
- export CLIP_CACHE=$PWD/work_dirs/.cache/clip
31
- export HF_HOME=$PWD/work_dirs/.cache/hf
32
- export TOKENIZERS_PARALLELISM=false
33
- export MKL_NUM_THREADS=1
34
- export OMP_NUM_THREADS=1
35
- export TORCH_DISTRIBUTED_DEBUG=INFO
36
- export HF_DATASETS_OFFLINE=1
37
- export TRANSFORMERS_OFFLINE=1
38
-
39
- export http_proxy="http://star-proxy.oa.com:3128"
40
- export https_proxy="http://star-proxy.oa.com:3128"
41
- export ftp_proxy="http://star-proxy.oa.com:3128"
42
- export no_proxy=".woa.com,mirrors.cloud.tencent.com,tlinux-mirror.tencent-cloud.com,tlinux-mirrorlist.tencent-cloud.com,localhost,127.0.0.1,mirrors-tlinux.tencentyun.com,.oa.com,.local,.3gqq.com,.7700.org,.ad.com,.ada_sixjoy.com,.addev.com,.app.local,.apps.local,.aurora.com,.autotest123.com,.bocaiwawa.com,.boss.com,.cdc.com,.cdn.com,.cds.com,.cf.com,.cjgc.local,.cm.com,.code.com,.datamine.com,.dvas.com,.dyndns.tv,.ecc.com,.expochart.cn,.expovideo.cn,.fms.com,.great.com,.hadoop.sec,.heme.com,.home.com,.hotbar.com,.ibg.com,.ied.com,.ieg.local,.ierd.com,.imd.com,.imoss.com,.isd.com,.isoso.com,.itil.com,.kao5.com,.kf.com,.kitty.com,.lpptp.com,.m.com,.matrix.cloud,.matrix.net,.mickey.com,.mig.local,.mqq.com,.oiweb.com,.okbuy.isddev.com,.oss.com,.otaworld.com,.paipaioa.com,.qqbrowser.local,.qqinternal.com,.qqwork.com,.rtpre.com,.sc.oa.com,.sec.com,.server.com,.service.com,.sjkxinternal.com,.sllwrnm5.cn,.sng.local,.soc.com,.t.km,.tcna.com,.teg.local,.tencentvoip.com,.tenpayoa.com,.test.air.tenpay.com,.tr.com,.tr_autotest123.com,.vpn.com,.wb.local,.webdev.com,.webdev2.com,.wizard.com,.wqq.com,.wsd.com,.sng.com,.music.lan,.mnet2.com,.tencentb2.com,.tmeoa.com,.pcg.com,www.wip3.adobe.com,www-mm.wip3.adobe.com,mirrors.tencent.com,csighub.tencentyun.com"
43
-
44
- sed -i 's/np.float/float/g' /usr/local/python/lib/python3.8/site-packages/lvis/eval.py
45
-
46
- touch /tmp/.unhold
47
-
48
- pip3 install -e .
49
- torchrun --nnodes=$1 --nproc_per_node=$2 --node_rank=$INDEX --master_addr=$CHIEF_IP ${@:3}
50
-
51
- rm /tmp/.unhold
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
taiji/jizhi_run_vanilla DELETED
@@ -1,105 +0,0 @@
1
- #!/bin/bash
2
- if [[ $1 = "--help" ]] || [[ $1 = "-h" ]]
3
- then
4
- echo "Usage: jizhi_run NUM_MECHINES NUM_GPUS TASK_NAME <CMDS>"
5
- fi
6
-
7
- # user configuration
8
- TOKEN=$TOKEN
9
- if [ ! -n "$IMAGE_FULL_NAME" ]; then
10
- IMAGE_FULL_NAME="mirrors.tencent.com/ronnysong_rd/fastdet:torch2.0.1-cuda11.7"
11
- fi
12
- if [ ! -n "$BUSINESS_FLAG" ]; then
13
- BUSINESS_FLAG="TEG_AILab_CVC_chongqing"
14
- fi
15
- if [ ! -n "$CEPH_BUSINESS_FLAG" ]; then
16
- CEPH_BUSINESS_FLAG="TEG_AILab_CVC_chongqing"
17
- fi
18
- if [ ! -n "$GPU_NAME" ]; then
19
- GPU_NAME="V100"
20
- fi
21
- if [ ! -n "$PRIORITY_LEVEL" ]; then
22
- PRIORITY_LEVEL="HIGH"
23
- fi
24
- if [ ! -n "$ELASTIC_LEVEL" ]; then
25
- ELASTIC_LEVEL=1
26
- fi
27
- if [ ! -n "$RDMA" ]; then
28
- RDMA="false"
29
- fi
30
- if [ ! -n "$CUDA" ]; then
31
- CUDA="11.0"
32
- fi
33
-
34
- CMD_PATH="start.sh"
35
- CONF_PATH="jizhi_conf.json"
36
- ROOT_PATH=$PWD
37
- UUID=$(date +%s)
38
-
39
- rm -f $CMD_PATH
40
-
41
- echo 'cd '$ROOT_PATH >> $CMD_PATH
42
- echo 'export HF_HOME="'$ROOT_PATH'/work_dirs/.cache/hf"' >> $CMD_PATH
43
- echo 'export TORCH_HOME="'$ROOT_PATH'/work_dirs/.cache/torch"' >> $CMD_PATH
44
- echo 'export CLIP_CACHE="'$ROOT_PATH'/work_dirs/.cache/clip"' >> $CMD_PATH
45
- echo 'export TRANSFORMERS_CACHE="'$ROOT_PATH'/work_dirs/.cache/transformers"' >> $CMD_PATH
46
- echo 'export MKL_NUM_THREADS=1' >> $CMD_PATH
47
- echo 'export OMP_NUM_THREADS=1' >> $CMD_PATH
48
- echo 'export TOKENIZERS_PARALLELISM=false' >> $CMD_PATH
49
- echo 'export TORCH_DISTRIBUTED_DEBUG=INFO' >> $CMD_PATH
50
- echo 'export NCCL_IB_GID_INDEX=3' >> $CMD_PATH
51
- if [ $BUSINESS_FLAG = "TaiJi_HYAide_BUFFER_SH_A800H" ]; then
52
- echo 'export NCCL_IB_GID_INDEX=3' >> $CMD_PATH
53
- echo 'export NCCL_IB_SL=3' >> $CMD_PATH
54
- echo 'export NCCL_CHECKS_DISABLE=1' >> $CMD_PATH
55
- echo 'export NCCL_P2P_DISABLE=0' >> $CMD_PATH
56
- echo 'export NCCL_IB_DISABLE=0' >> $CMD_PATH
57
- echo 'export NCCL_LL_THRESHOLD=16384' >> $CMD_PATH
58
- echo 'export NCCL_IB_CUDA_SUPPORT=1' >> $CMD_PATH
59
- echo 'export NCCL_SOCKET_IFNAME=bond1' >> $CMD_PATH
60
- echo 'export UCX_NET_DEVICES=bond1' >> $CMD_PATH
61
- echo 'export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6' >> $CMD_PATH
62
- echo 'export NCCL_COLLNET_ENABLE=0' >> $CMD_PATH
63
- echo 'export SHARP_COLL_ENABLE_SAT=0' >> $CMD_PATH
64
- echo 'export NCCL_NET_GDR_LEVEL=2' >> $CMD_PATH
65
- echo 'export NCCL_IB_QPS_PER_CONNECTION=4' >> $CMD_PATH
66
- echo 'export NCCL_IB_TC=160' >> $CMD_PATH
67
- echo 'export NCCL_PXN_DISABLE=1' >> $CMD_PATH
68
- fi
69
- echo ${@:4} >> $CMD_PATH
70
-
71
- chmod +x $CMD_PATH
72
-
73
- rm -f $CONF_PATH
74
-
75
- #INIT_CMD="jizhi_client mount -bf TEG_AILab_CVC_chongqing -tk $TOKEN"
76
- INIT_CMD=""
77
-
78
- echo '{' > $CONF_PATH
79
- echo '"Token": "'$TOKEN'",' >> $CONF_PATH
80
- echo '"business_flag": "'$BUSINESS_FLAG'",' >> $CONF_PATH
81
- echo '"model_local_file_path": "'$ROOT_PATH'/'$CMD_PATH'",' >> $CONF_PATH
82
- echo '"host_num": '$1',' >> $CONF_PATH
83
- echo '"host_gpu_num": '$2',' >> $CONF_PATH
84
- echo '"task_flag": "'$3'_'$UUID'",' >> $CONF_PATH
85
- echo '"priority_level": "'$PRIORITY_LEVEL'",' >> $CONF_PATH
86
- echo '"elastic_level": '$ELASTIC_LEVEL',' >> $CONF_PATH
87
- echo '"cuda_version": "'$CUDA'",' >> $CONF_PATH
88
- echo '"image_full_name": "'$IMAGE_FULL_NAME'",' >> $CONF_PATH
89
- echo '"GPUName": "'$GPU_NAME'",' >> $CONF_PATH
90
- echo '"mount_ceph_business_flag": "'$CEPH_BUSINESS_FLAG'",' >> $CONF_PATH
91
- echo '"exec_start_in_all_mpi_pods": true,' >> $CONF_PATH
92
- echo '"enable_rdma": '$RDMA',' >> $CONF_PATH
93
- echo '"init_cmd": "'$INIT_CMD'",' >> $CONF_PATH
94
- echo '"envs": {' >> $CONF_PATH
95
- echo ' "HUNYUAN_TASK_CATEGORY": "LLM",' >> $CONF_PATH
96
- echo ' "HUNYUAN_TASK_MODEL_TYPE": "SFT",' >> $CONF_PATH
97
- echo ' "HUNYUAN_TASK_DOMAIN": "NLP",' >> $CONF_PATH
98
- echo ' "HUNYUAN_TASK_START_MODEL_TYPE": "7B冷启"}' >> $CONF_PATH
99
- echo '}' >> $CONF_PATH
100
-
101
- jizhi_client start -scfg $CONF_PATH
102
-
103
- rm -f $CMD_PATH
104
- rm -f $CONF_PATH
105
-