llm_mutil_npu / scripts /bench_pld_safe.sh

Initial C++ aclnn EAGER inference for Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU

4b9fefd 19 days ago

6.26 kB

	#!/usr/bin/env bash
	# bench_pld_safe.sh — PLD benchmark with output correctness check.
	# Unlike bench_tg.sh (which only reports TG numbers), this wrapper also inspects the
	# generated text for degeneration signals (consecutive identical tokens / very low
	# distinct-token ratio in the tail) and flags runs whose high TG came from dead-loop
	# output rather than real acceleration.
	#
	# Usage: ./scripts/bench_pld_safe.sh [N_RUNS] [PROMPT_FILE]
	# PROMPT_FILE holds one prompt per line, using "|" as separator: "tag|prompt text"
	# Default: tests multiple prompt classes and reports which ones PLD helps safely.
	# Only -u is enabled: several command substitutions further down rely on grep
	# exiting non-zero when nothing matches, so -e / pipefail would need those call
	# sites guarded first.
	set -u
	# Every relative path below (./build, ./scripts, tokenizer_data) is resolved from
	# the parent of this script's directory, so a failed cd must stop the run.
	cd "$(dirname "$0")/.." || { echo "bench_pld_safe: cannot cd to $(dirname "$0")/.." >&2; exit 1; }

	MODEL="${MODEL_DIR:-/path/to/Qwen3-235B-A22B-Instruct-2507-BF16}"
	BIN="./build/qwen3-moe-aclnn"
	N_RUNS="${1:-3}"
	N_PREDICT="${N_PREDICT:-120}"
	VOCAB="tokenizer_data/vocab.bin"

	# N_RUNS drives the per-prompt loop and is the divisor in the "x/N OK" summary,
	# so reject anything that is not a positive integer before launching anything.
	case "$N_RUNS" in
	  '' | *[!0-9]*)
	    echo "bench_pld_safe: N_RUNS must be a positive integer (got '$N_RUNS')" >&2
	    exit 2
	    ;;
	esac
	if [ "$N_RUNS" -lt 1 ]; then
	  echo "bench_pld_safe: N_RUNS must be at least 1" >&2
	  exit 2
	fi

	# Default prompt suite: one per class. Override via arg 2 (file with "tag|prompt" per line).
	default_prompts=(
	  "story|Once upon a time, in a small village,"
	  "factual|The capital of France is"
	  "code|Write a Python function that computes Fibonacci."
	  "essay|The history of artificial intelligence spans several decades and"
	)

	if [ -n "${2:-}" ] && [ -f "${2:-}" ]; then
	  mapfile -t prompts < "$2"
	else
	  if [ -n "${2:-}" ]; then
	    # Fall back as before, but say so: otherwise the wrong suite gets benchmarked silently.
	    echo "bench_pld_safe: prompt file '$2' not found; using the default prompts" >&2
	  fi
	  prompts=("${default_prompts[@]}")
	fi

	# ----- Correctness classifier -----
	# classify_output: reads generated text on stdin and prints exactly one verdict:
	#   OK                     no loop signal (also printed when fewer than 5 words were read)
	#   LOOP_<n>               the longest run of identical consecutive words is n, with n >= 6
	#   LOW_DIVERSITY_<d>/<t>  the last t words (20 <= t <= 40) contain only d < 10 distinct words
	# Words are whitespace-separated, lower-cased, and stripped of leading/trailing
	# punctuation, so "the," "The" and "(the)" all compare equal.
	classify_output() {
	  awk '
	    {
	      # Split the line on whitespace runs and normalise each token.
	      n = split($0, w, /[[:space:]]+/)
	      for (i = 1; i <= n; i++) {
	        # Bare "|" is the ERE alternation: strip punctuation at either edge.
	        # An escaped pipe here would be a literal character and strip nothing.
	        gsub(/^[[:punct:]]+|[[:punct:]]+$/, "", w[i])
	        if (w[i] == "") continue
	        words[++nw] = tolower(w[i])
	      }
	    }
	    END {
	      # Too little text to judge either signal.
	      if (nw + 0 < 5) { print "OK"; exit }

	      # Signal 1: longest run of identical consecutive words.
	      run = 1; max_run = 1
	      for (i = 2; i <= nw; i++) {
	        if (words[i] == words[i - 1]) {
	          run++
	          if (run > max_run) max_run = run
	        } else {
	          run = 1
	        }
	      }
	      if (max_run >= 6) { printf "LOOP_%d\n", max_run; exit }

	      # Signal 2: distinct-word count over the last 40 words (or all, if fewer).
	      tail_start = nw - 39
	      if (tail_start < 1) tail_start = 1
	      distinct = 0
	      for (i = tail_start; i <= nw; i++) {
	        if (!(words[i] in seen)) { seen[words[i]] = 1; distinct++ }
	      }
	      tail_n = nw - tail_start + 1
	      if (tail_n >= 20 && distinct < 10) {
	        printf "LOW_DIVERSITY_%d/%d\n", distinct, tail_n
	        exit
	      }
	      print "OK"
	    }'
	}

	# run_once PROMPT EXTRA_FLAGS
	# Launches one 16-way run of $BIN through ./scripts/tp_launch.sh and prints a single
	# line "TG|VERDICT|WARN|PLD" on stdout:
	#   TG       field NF-2 of the last "[perf] decode" stdout line, or 0 when there is none
	#   VERDICT  classify_output result for the text following the prompt; NO_PERF when no
	#            perf line was printed, NO_TEXT when no line containing the prompt was found
	#   WARN     "WARN" if stderr contains "[warn]", otherwise empty
	#   PLD      last "[pld]" stderr line with its prefix removed, otherwise empty
	# Globals read: BIN, MODEL, N_PREDICT, VOCAB.
	# EXTRA_FLAGS is a whitespace-separated flag string and is word-split on purpose.
	run_once() {
	  local prompt="$1"
	  local extra_flags="${2:-}"
	  local stdout_file stderr_file tg gen_text stripped verdict pld_line
	  local has_warn=""

	  # Declared separately from the assignment so a mktemp failure is not masked by "local".
	  stdout_file=$(mktemp) || return 1
	  stderr_file=$(mktemp) || { rm -f -- "$stdout_file"; return 1; }

	  # Ensure no lockfile leftover.
	  ssh_cleanup_lockfile

	  # The binary prints to stdout: rank/cli headers, runner loading lines,
	  # generated text (--no-stream), then perf lines. pld/warn go to stderr.
	  # shellcheck disable=SC2086  # extra_flags must split into separate arguments
	  ./scripts/tp_launch.sh 16 "$BIN" --model-dir "$MODEL" \
	    --prompt "$prompt" --n-predict "$N_PREDICT" \
	    --vocab "$VOCAB" --seed 0 --no-stream --temperature 0 \
	    $extra_flags 1>"$stdout_file" 2>"$stderr_file"

	  # TG lives on stdout. Keep only the last match so the result stays on one line.
	  tg=$(grep '\[perf\] decode' "$stdout_file" | awk '{print $(NF-2)}' | tail -n 1)

	  # Generated text: the last non-bracketed line containing the prompt
	  # (--no-stream echoes prompt+text).
	  gen_text=$(grep -F -- "$prompt" "$stdout_file" | grep -v '^\[' | tail -n 1)
	  # Quote the prompt so characters such as * ? [ are stripped literally, not as a glob.
	  stripped="${gen_text#"$prompt"}"

	  if [ -z "$tg" ]; then
	    # No measurement at all (crashed or aborted launch): never report this as OK.
	    verdict="NO_PERF"
	  elif [ -z "$gen_text" ]; then
	    # Nothing to inspect, so correctness cannot be vouched for.
	    verdict="NO_TEXT"
	  else
	    verdict=$(printf '%s\n' "$stripped" | classify_output)
	  fi

	  if grep -q '\[warn\]' "$stderr_file"; then
	    has_warn="WARN"
	  fi
	  pld_line=$(grep '\[pld\]' "$stderr_file" | tail -n 1 | sed 's/^\[pld\] //')

	  rm -f -- "$stdout_file" "$stderr_file"
	  printf '%s|%s|%s|%s\n' "${tg:-0}" "$verdict" "$has_warn" "$pld_line"
	}

	# ssh_cleanup_lockfile [PATH]
	# Best-effort removal of the file that run_once treats as a leftover lockfile from
	# a previous launch. PATH defaults to /tmp/hccl_root_info.bin.
	# Errors are suppressed on purpose and the function always returns 0.
	ssh_cleanup_lockfile() {
	  local lockfile="${1:-/tmp/hccl_root_info.bin}"
	  # "|| true" must be a shell operator: escaped, it reaches rm as two extra file names.
	  rm -f -- "$lockfile" 2>/dev/null || true
	}

	# bench_prompt TAG PROMPT FLAGS
	# Runs run_once N_RUNS times for one prompt/flag combination, prints one line per
	# run, then a summary that keeps runs with verdict "OK" apart from all others so
	# that degraded output never contributes to the reported mean TG.
	# Globals read: N_RUNS.
	bench_prompt() {
	  local tag="$1" prompt="$2" flags="${3:-}"
	  local r i result tg verdict warn pld
	  local tgs=() verdicts=()

	  echo ""
	  # Substring expansion truncates by character, so a multi-byte prompt is never cut mid-character.
	  echo "=== [$tag] ${prompt:0:50}... (flags: ${flags:-none}) ==="

	  for ((r = 1; r <= N_RUNS; r++)); do
	    result=$(run_once "$prompt" "$flags")
	    # run_once emits "TG|VERDICT|WARN|PLD"; pld comes last so it keeps any "|" of its own.
	    IFS='|' read -r tg verdict warn pld <<< "$result"
	    printf " run %d: TG=%s verdict=%s %s\n" "$r" "$tg" "$verdict" "$warn"
	    if [ -n "$pld" ]; then
	      printf " %s\n" "$pld"
	    fi
	    tgs+=("${tg:-0}")
	    verdicts+=("$verdict")
	    ssh_cleanup_lockfile
	  done

	  # Split good vs degraded
	  local good_tgs=() bad_tgs=()
	  for ((i = 0; i < ${#tgs[@]}; i++)); do
	    if [ "${verdicts[$i]}" = "OK" ]; then
	      good_tgs+=("${tgs[$i]}")
	    else
	      bad_tgs+=("${tgs[$i]}")
	    fi
	  done

	  local n_good=${#good_tgs[@]}
	  local n_bad=${#bad_tgs[@]}
	  local mean bad_mean
	  echo " → $n_good/$N_RUNS OK, $n_bad/$N_RUNS degraded"
	  if [ "$n_good" -gt 0 ]; then
	    mean=$(printf '%s\n' "${good_tgs[@]}" | awk '{s+=$1} END {printf "%.2f", s/NR}')
	    echo " → OK mean TG: $mean t/s (values: ${good_tgs[*]})"
	  fi
	  if [ "$n_bad" -gt 0 ]; then
	    bad_mean=$(printf '%s\n' "${bad_tgs[@]}" | awk '{s+=$1} END {printf "%.2f", s/NR}')
	    echo " → degraded mean TG: $bad_mean t/s (DO NOT REPORT as speedup) (values: ${bad_tgs[*]})"
	  fi
	}

	# run_suite: benchmarks every entry of the global "prompts" array in three
	# configurations (no PLD, PLD with guard, PLD without guard) and then prints the
	# interpretation notes. Entries have the form "tag|prompt"; only the first "|"
	# separates, so the prompt itself may contain "|".
	# Globals read: prompts, N_RUNS, N_PREDICT.
	run_suite() {
	  local entry tag prompt

	  echo "bench_pld_safe: $N_RUNS runs × $N_PREDICT tokens per prompt; comparing [no-pld, pld+guard, pld+no-guard]"

	  # The ${arr[@]+...} form keeps an empty array from tripping "set -u" on older bash.
	  for entry in ${prompts[@]+"${prompts[@]}"}; do
	    # Blank lines in a prompt file would otherwise be benchmarked as empty prompts.
	    [[ -n "${entry//[[:space:]]/}" ]] || continue
	    if [[ "$entry" == *'|'* ]]; then
	      tag="${entry%%|*}"
	      prompt="${entry#*|}"
	    else
	      # No separator: the whole line is the prompt; give it a placeholder tag.
	      tag="untagged"
	      prompt="$entry"
	    fi
	    bench_prompt "$tag/base" "$prompt" ""
	    bench_prompt "$tag/pld+guard" "$prompt" "--pld"
	    bench_prompt "$tag/pld-raw" "$prompt" "--pld --pld-no-guard"
	  done

	  echo ""
	  echo "=========================================================="
	  echo "Interpretation:"
	  echo " OK mean TG is the only honest number to report."
	  echo " Any 'degraded' result with high TG is a dead-loop artifact."
	  echo " Expected: pld+guard matches or beats base on creative/story prompts,"
	  echo " matches base on factual/code prompts (drafts rejected → fallback to single decode)."
	  echo " pld-raw (no guard) on repetitive prompts produces 'degraded' with high TG."
	}

	run_suite