Spaces:
Sleeping
Sleeping
File size: 1,623 Bytes
2ada650 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
#!/bin/bash
#
# Runs the five benchmark evaluation scripts (correctness, detailed
# orientation, contextual understanding, temporal understanding and
# consistency) one after another with a shared set of arguments.
#
# Settings to edit below:
#   PRED        default prediction path handed to the evaluators
#   OUTPUT_DIR  directory under which per-benchmark outputs are written
#   API_KEY     value passed to every evaluator via --api_key
#   NUM_TASKS   value passed to every evaluator via --num_tasks
#
# Optional environment overrides (each falls back to PRED when unset/empty):
#   PRED_GENERIC      prediction path for benchmarks 1-3
#   PRED_TEMPORAL     prediction path for benchmark 4
#   PRED_CONSISTENCY  prediction path for benchmark 5
#
# Exit status: 0 when every evaluation succeeded, non-zero otherwise.

# Abort on any reference to an unset variable instead of silently passing
# an empty argument to an evaluator.
set -u

# Define common arguments for all scripts
PRED="pred_path"
OUTPUT_DIR="output_dir"
API_KEY="api_key"
NUM_TASKS=128

# The invocations below use per-benchmark prediction variables. Default each
# one to PRED so it is never empty, while still honouring a value exported
# by the caller.
PRED_GENERIC="${PRED_GENERIC:-${PRED}}"
PRED_TEMPORAL="${PRED_TEMPORAL:-${PRED}}"
PRED_CONSISTENCY="${PRED_CONSISTENCY:-${PRED}}"

# Number of evaluation scripts that returned a non-zero status.
failed=0

#######################################
# Runs one evaluation script with the shared arguments.
# Globals:   OUTPUT_DIR, API_KEY, NUM_TASKS (read); failed (incremented)
# Arguments: $1 - evaluation script to run with python
#            $2 - prediction path passed as --pred_path
#            $3 - sub-directory of OUTPUT_DIR passed as --output_dir
#            $4 - file name under OUTPUT_DIR passed as --output_json
# Outputs:   an error line on stderr if the script fails
# Returns:   0 always; failures are counted so later benchmarks still run
#######################################
run_eval() {
  local script=$1
  local pred_path=$2
  local eval_subdir=$3
  local results_json=$4

  if ! python "${script}" \
    --pred_path "${pred_path}" \
    --output_dir "${OUTPUT_DIR}/${eval_subdir}" \
    --output_json "${OUTPUT_DIR}/${results_json}" \
    --api_key "${API_KEY}" \
    --num_tasks "${NUM_TASKS}"; then
    printf 'error: %s failed\n' "${script}" >&2
    failed=$((failed + 1))
  fi
}

# Run the "correctness" evaluation script
run_eval evaluate_benchmark_1_correctness.py \
  "${PRED_GENERIC}" correctness_eval correctness_results.json

# Run the "detailed orientation" evaluation script
run_eval evaluate_benchmark_2_detailed_orientation.py \
  "${PRED_GENERIC}" detailed_eval detailed_orientation_results.json

# Run the "contextual understanding" evaluation script
run_eval evaluate_benchmark_3_context.py \
  "${PRED_GENERIC}" context_eval contextual_understanding_results.json

# Run the "temporal understanding" evaluation script
run_eval evaluate_benchmark_4_temporal.py \
  "${PRED_TEMPORAL}" temporal_eval temporal_understanding_results.json

# Run the "consistency" evaluation script
run_eval evaluate_benchmark_5_consistency.py \
  "${PRED_CONSISTENCY}" consistency_eval consistency_results.json

if (( failed == 0 )); then
  echo "All evaluations completed!"
else
  printf '%d evaluation(s) failed\n' "${failed}" >&2
fi

# Final command: its status becomes the script's exit status, so callers can
# detect that at least one evaluation failed.
(( failed == 0 ))
|