File size: 2,091 Bytes
b585c7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/bash
# Runs the pytest suite as several background pytest processes in parallel.
#
# CHOOSE:
# Number of GPUs the pytest processes are cycled over (see CUDA_VISIBLE_DEVICES below).
ngpus=4
# Number of parallel pytest processes started below.
# below has to match GPUs for A6000s due to long context tests
export TESTMODULOTOTAL=4

# Best effort: carry on even if the plugin cannot be installed.
pip install pytest-instafail || true

# Stop any running text-generation-inference containers.
# -r stops xargs from calling a bare `docker stop` (a usage error) when
# no container matches.
docker ps | awk '/text-generation-inference/ {print $1}' | xargs -r docker stop

# Signal any pytest processes that are still around: INT, then TERM, then KILL.
killall -s SIGINT pytest
killall -s SIGTERM pytest
killall -s 9 pytest
# Force-kill any embedded weaviate process (matched against the full command line).
pkill --signal 9 -f weaviate-embedded/weaviate

# Count CPUs from lscpu's parseable output, ignoring its '#' comment lines.
# NPHYSICAL: number of distinct values of columns 2-4
#            (presumably core, socket, node in lscpu's default layout).
# NPROCS:    number of logical CPUs (one output row each).
NPHYSICAL=$(lscpu -p | grep -v '^#' | sort -u -t, -k 2,4 | wc -l)
NPROCS=$(lscpu -p | grep -vc '^#')
# Threads given to each pytest process: an equal share of the logical CPUs.
n_jobs=$((NPROCS / TESTMODULOTOTAL))
# Never hand out 0 threads when there are fewer CPUs than pytest processes.
if ((n_jobs < 1)); then
  n_jobs=1
fi
echo "CORES: $NPHYSICAL $NPROCS $n_jobs"

# GENERAL:
# Start one background pytest process per index in 0..TESTMODULOTOTAL-1,
# each with its own port base, GPU, thread limits and base path, and
# collect the PIDs (space-separated) in $pids.
lowergpuid=0
low=0
high=$((TESTMODULOTOTAL - 1))
pids=""

# Install the handler before anything is launched so that an interrupt
# during start-up also stops the processes already running. Single quotes
# defer the expansion of $pids to signal time; it is left unquoted on
# purpose so that every PID becomes its own argument to kill.
# shellcheck disable=SC2086
trap '[[ -n "$pids" ]] && kill $pids; exit 1' INT

for ((mod = low; mod <= high; mod++)); do
  # in some cases launch gradio server, TGI server, or gradio server as inference server with +1 and +2 off base port
  # ports always increment by 3
  export GRADIO_SERVER_PORT=$((7860 + mod * 3))
  export TESTMODULO=$mod

  # CVD loops over number of GPUs
  export CUDA_VISIBLE_DEVICES=$((lowergpuid + mod % ngpus))
  export n_jobs
  export OMP_NUM_THREADS=$n_jobs
  export NUMEXPR_MAX_THREADS=$n_jobs
  export OPENBLAS_NUM_THREADS=$n_jobs
  # By default, OpenBLAS will restrict the Cpus_allowed to be 0x1.
  export OPENBLAS_MAIN_FREE=$n_jobs
  export MKL_NUM_THREADS=$n_jobs
  export H2OGPT_BASE_PATH="./base_$mod"

  # huggyllama test uses a lot of memory, requires TESTMODULOTOTAL=ngpus for even A6000s
  # pytest --instafail -s -v -n 1 tests -k "not test_huggyllama_transformers_pr" &> testsparallel"${mod}".log &
  # stdout and stderr of each process go to its own log file.
  pytest --instafail -s -v -n 1 tests  &> testsparallel"${mod}".log &
  pid=$!
  echo "MODS: $mod $GRADIO_SERVER_PORT $CUDA_VISIBLE_DEVICES $H2OGPT_BASE_PATH"
  pids="$pids $pid"
done

# Print copy-pasteable commands for monitoring and interrupting the run.
# The here-document is unquoted so that $pids is expanded; the glob and the
# single-quoted sed expressions are emitted literally.
# Alternative way to interrupt, kept for reference:
#   ps -auxwf | grep -v "[g]rep" | grep pytest | awk '{print $2}' |xargs kill -s SIGINT
cat <<EOF
to check on results while running, do:
grep -a PASSED testsparallel*.log | sed 's/.*PASSED//g' | sort | uniq |wc -l
grep -a FAILED testsparallel*.log | sed 's/.*FAILED//g' | sort | uniq |wc -l
to interrupt but still get some results, do:
kill -s SIGINT $pids
EOF

# Block until every background job has finished.
wait