| #!/usr/bin/env bash |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| set -u |
| cd "$(dirname "$0")/.." |
|
|
| MODEL="${MODEL_DIR:-/path/to/Qwen3-235B-A22B-Instruct-2507-BF16}" |
| BIN="./build/qwen3-moe-aclnn" |
| N_RUNS="${1:-3}" |
| N_PREDICT="${N_PREDICT:-120}" |
| VOCAB="tokenizer_data/vocab.bin" |
|
|
| |
| default_prompts=( |
| "story|Once upon a time, in a small village," |
| "factual|The capital of France is" |
| "code|Write a Python function that computes Fibonacci." |
| "essay|The history of artificial intelligence spans several decades and" |
| ) |
|
|
| if [ "${2:-}" != "" ] && [ -f "${2:-}" ]; then |
| mapfile -t prompts < "$2" |
| else |
| prompts=("${default_prompts[@]}") |
| fi |
|
|
| |
| |
| |
| |
| |
| classify_output() { |
| awk ' |
| { |
| # Tokenize on whitespace; strip punct at edges for comparison. |
| n = split($0, w, /[[:space:]]+/); |
| for (i = 1; i <= n; i++) { |
| gsub(/^[[:punct:]]+|[[:punct:]]+$/, "", w[i]); |
| if (w[i] == "") continue; |
| words[++nw] = tolower(w[i]); |
| } |
| } |
| END { |
| if (nw < 5) { print "OK"; exit } |
| # consecutive-same detection |
| run = 1; max_run = 1; |
| for (i = 2; i <= nw; i++) { |
| if (words[i] == words[i-1]) { run++; if (run > max_run) max_run = run; } |
| else run = 1; |
| } |
| if (max_run >= 6) { printf "LOOP_%d\n", max_run; exit } |
| |
| # tail diversity: last 40 words |
| tail_start = nw - 39; if (tail_start < 1) tail_start = 1; |
| delete seen; |
| distinct = 0; |
| for (i = tail_start; i <= nw; i++) { |
| if (!(words[i] in seen)) { seen[words[i]] = 1; distinct++; } |
| } |
| tail_n = nw - tail_start + 1; |
| if (tail_n >= 20 && distinct < 10) { |
| printf "LOW_DIVERSITY_%d/%d\n", distinct, tail_n; |
| exit; |
| } |
| print "OK"; |
| }' |
| } |
|
|
| run_once() { |
| local prompt="$1" |
| local extra_flags="$2" |
| |
| |
| local stdout_file=$(mktemp) |
| local stderr_file=$(mktemp) |
| |
| ssh_cleanup_lockfile |
| ./scripts/tp_launch.sh 16 $BIN --model-dir "$MODEL" \ |
| --prompt "$prompt" --n-predict $N_PREDICT \ |
| --vocab "$VOCAB" --seed 0 --no-stream --temperature 0 \ |
| $extra_flags 1>"$stdout_file" 2>"$stderr_file" |
| |
| local tg=$(grep "\[perf\] decode" "$stdout_file" | awk '{print $(NF-2)}') |
| |
| local gen_text=$(grep -F -- "$prompt" "$stdout_file" | grep -v '^\[' | tail -1) |
| local stripped="${gen_text#$prompt}" |
| local verdict=$(echo "$stripped" | classify_output) |
| local has_warn="" |
| if grep -q "\[warn\]" "$stderr_file"; then has_warn="WARN"; fi |
| local pld_line=$(grep "\[pld\]" "$stderr_file" | tail -1 | sed 's/^\[pld\] //') |
| rm -f "$stdout_file" "$stderr_file" |
| echo "${tg:-0}|${verdict}|${has_warn}|${pld_line}" |
| } |
|
|
| ssh_cleanup_lockfile() { |
| rm -f /tmp/hccl_root_info.bin 2>/dev/null || true |
| } |
|
|
| bench_prompt() { |
| local tag="$1"; local prompt="$2"; local flags="$3" |
| echo "" |
| echo "=== [$tag] $(echo "$prompt" | head -c 50)... (flags: ${flags:-none}) ===" |
| local tgs=() verdicts=() warns=() plds=() |
| for r in $(seq 1 $N_RUNS); do |
| result=$(run_once "$prompt" "$flags") |
| IFS='|' read -r tg verdict warn pld <<< "$result" |
| printf " run %d: TG=%s verdict=%s %s\n" "$r" "$tg" "$verdict" "$warn" |
| [ -n "$pld" ] && printf " %s\n" "$pld" |
| tgs+=("${tg:-0}"); verdicts+=("$verdict"); warns+=("$warn") |
| rm -f /tmp/hccl_root_info.bin |
| done |
| |
| local good_tgs=() bad_tgs=() |
| for i in "${!tgs[@]}"; do |
| if [ "${verdicts[$i]}" = "OK" ]; then good_tgs+=("${tgs[$i]}"); else bad_tgs+=("${tgs[$i]}"); fi |
| done |
| local n_good=${#good_tgs[@]} |
| local n_bad=${#bad_tgs[@]} |
| echo " β $n_good/$N_RUNS OK, $n_bad/$N_RUNS degraded" |
| if [ $n_good -gt 0 ]; then |
| local mean=$(printf '%s\n' "${good_tgs[@]}" | awk '{s+=$1} END {printf "%.2f", s/NR}') |
| echo " β OK mean TG: $mean t/s (values: ${good_tgs[*]})" |
| fi |
| if [ $n_bad -gt 0 ]; then |
| local bad_mean=$(printf '%s\n' "${bad_tgs[@]}" | awk '{s+=$1} END {printf "%.2f", s/NR}') |
| echo " β degraded mean TG: $bad_mean t/s (DO NOT REPORT as speedup) (values: ${bad_tgs[*]})" |
| fi |
| } |
|
|
| echo "bench_pld_safe: $N_RUNS runs Γ $N_PREDICT tokens per prompt; comparing [no-pld, pld+guard, pld+no-guard]" |
|
|
| for entry in "${prompts[@]}"; do |
| tag="${entry%%|*}" |
| prompt="${entry#*|}" |
| bench_prompt "$tag/base" "$prompt" "" |
| bench_prompt "$tag/pld+guard" "$prompt" "--pld" |
| bench_prompt "$tag/pld-raw" "$prompt" "--pld --pld-no-guard" |
| done |
|
|
| echo "" |
| echo "==========================================================" |
| echo "Interpretation:" |
| echo " OK mean TG is the only honest number to report." |
| echo " Any 'degraded' result with high TG is a dead-loop artifact." |
| echo " Expected: pld+guard matches or beats base on creative/story prompts," |
| echo " matches base on factual/code prompts (drafts rejected β fallback to single decode)." |
| echo " pld-raw (no guard) on repetitive prompts produces 'degraded' with high TG." |
|
|