Stack-2-9-finetuned / stack /eval /quick_human_eval.sh
walidsobhie-code
refactor: Squeeze folders further - cleaner structure
65888d5
#!/bin/bash
# Stack 2.9 Quick HumanEval Evaluation Wrapper
#
# Usage:   ./quick_human_eval.sh [provider] [model] [num_samples]
# Example: ./quick_human_eval.sh ollama qwen2.5-coder:32b 20
#
# All three arguments are optional; num_samples is the number of problems
# to evaluate.

# Abort as soon as any unguarded command fails.
set -e

# Resolve the absolute directory that contains this script and make it the
# working directory, so the script behaves the same regardless of where it
# is invoked from.
#
# CDPATH is cleared and cd's stdout is discarded inside the substitution:
# when CDPATH is set, `cd` may print the directory it resolved, and that
# text would otherwise be captured together with the output of `pwd`.
# The `--` guards protect against a path that begins with a dash.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
cd -- "$SCRIPT_DIR"
# Positional arguments; each falls back to its default when omitted or empty.
#   $1 -> PROVIDER, $2 -> MODEL, $3 -> MAX_PROBLEMS
PROVIDER="${1:-ollama}"
MODEL="${2:-qwen2.5-coder:32b}"
MAX_PROBLEMS="${3:-20}"

# Banner summarising the configuration for this run, followed by a blank line.
printf '%s\n' \
  "========================================" \
  "Stack 2.9 HumanEval Quick Evaluation" \
  "========================================" \
  "Provider: $PROVIDER" \
  "Model: $MODEL" \
  "Problems: $MAX_PROBLEMS" \
  ""
#######################################
# Detect whether a `vllm` command is available and record the result.
# Globals:   USE_VLLM (written) - "--use-vllm" when vllm is found,
#            empty string otherwise
# Arguments: none
# Outputs:   one status line on stdout
#######################################
detect_vllm() {
  if command -v vllm &> /dev/null; then
    USE_VLLM="--use-vllm"
    echo "✓ vLLM detected - will use for faster inference"
  else
    USE_VLLM=""
    echo "⚠ vLLM not found - using standard inference"
  fi
}

detect_vllm
#######################################
# Print advisory status lines about whether the selected provider looks
# usable. Purely informational: nothing here stops the script.
# Globals:   OPENAI_API_KEY, ANTHROPIC_API_KEY (read)
# Arguments: $1 - provider name
# Outputs:   status lines on stdout
#######################################
check_provider() {
  local provider="${1:-}"

  case "$provider" in
    ollama)
      if command -v ollama &> /dev/null; then
        echo "✓ Ollama available"
        # Probe the local Ollama HTTP endpoint to see whether a server
        # answers. --fail treats HTTP error responses as "not running";
        # --max-time keeps an unresponsive listener from hanging the script.
        if curl --silent --fail --max-time 5 \
          http://localhost:11434/api/tags &> /dev/null; then
          echo "✓ Ollama server running"
        else
          echo "⚠ Ollama server not running - start with: ollama serve"
        fi
      else
        echo "⚠ Ollama not installed - will attempt anyway"
      fi
      ;;
    openai)
      # ${VAR:-} keeps the test safe if the script is ever run with `set -u`.
      if [[ -z "${OPENAI_API_KEY:-}" ]]; then
        echo "⚠ OPENAI_API_KEY not set"
      else
        echo "✓ OpenAI API key configured"
      fi
      ;;
    anthropic)
      if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
        echo "⚠ ANTHROPIC_API_KEY not set"
      else
        echo "✓ Anthropic API key configured"
      fi
      ;;
    *)
      # Other providers are passed through untouched; just say that no
      # pre-flight check exists for them.
      echo "⚠ No availability check for provider: $provider"
      ;;
  esac
}

check_provider "${PROVIDER:-}"
echo ""
echo "Running evaluation..."
echo "----------------------------------------"

# Assemble the evaluator's arguments as an array so every value is passed as
# exactly one word, whatever characters it contains.
eval_args=(
  --provider "$PROVIDER"
  --model "$MODEL"
  --max-problems "$MAX_PROBLEMS"
  --timeout 30
)
# USE_VLLM is either empty or a single flag; append it only when non-empty
# instead of relying on unquoted word-splitting to drop the empty case.
if [[ -n "${USE_VLLM:-}" ]]; then
  eval_args+=("$USE_VLLM")
fi

# Run the evaluation. The status is captured explicitly so that a failure is
# reported and propagated even when `set -e` is not in effect, and nothing
# after this point can claim success for a failed run.
eval_status=0
python3 -m benchmarks.human_eval "${eval_args[@]}" || eval_status=$?
if (( eval_status != 0 )); then
  echo "Evaluation failed (exit status $eval_status)" >&2
  exit "$eval_status"
fi
# Closing banner, location of the results file, and pointers for running the
# complete benchmark. Emitted with a single printf, one argument per line.
printf '%s\n' \
  "" \
  "========================================" \
  "Evaluation complete!" \
  "========================================" \
  "" \
  "Results saved to: results/humaneval.json" \
  "" \
  "To run full 164-problem benchmark:" \
  " 1. Download full HumanEval dataset" \
  " 2. Use GPU with 80GB VRAM (A100/H100)" \
  " 3. See HUMAN_EVAL_PLAN.md for details"