#!/bin/bash
#
# Bulk-generate programming problems through the OpenAI Batch API.
#
# Drives generate_problems_batch.py through its sub-commands in order:
# estimate -> prepare -> submit -> status (polling) -> download -> process.
# The script is interactive: it asks for confirmation before preparing,
# before submitting, and before it starts polling.
#
# Environment:
#   OPENAI_API_KEY   must be set; it is checked before any work starts.
# Working directory must contain:
#   function_dataset_v2.csv, generate_problems_batch.py

# Stop at the first command that fails outside of a condition.
set -e

# ---------------------------------------------------------------------------
# Configuration (single source of truth for the banner and every stage)
# ---------------------------------------------------------------------------
BUDGET=40                                        # budget figure shown in the banner
MIN_SCORE=60                                     # forwarded to `prepare --min-score`
MODEL="gpt-5-nano"                               # forwarded to every `--model` option
INPUT_FILE="function_dataset_v2.csv"             # input of the prepare stage
BATCH_REQUESTS_FILE="batch_requests_full.jsonl"  # output of prepare, input of submit
BATCH_RESULTS_RAW="batch_results_raw.jsonl"      # output of download
FINAL_OUTPUT="programming_problems_batch.jsonl"  # output of process
BATCH_ID_FILE="batch_id.txt"                     # where the submitted batch id is stored
readonly BUDGET MIN_SCORE MODEL INPUT_FILE BATCH_REQUESTS_FILE \
  BATCH_RESULTS_RAW FINAL_OUTPUT BATCH_ID_FILE

# ---------------------------------------------------------------------------
# Banner. Budget and model are taken from the configuration above so the text
# cannot drift from the values the stages actually use.
# ---------------------------------------------------------------------------
echo "🚀 OpenAI Batch API 批量生成编程问题"
echo "========================================"
echo "预算: \$${BUDGET}"
echo "模型: ${MODEL} (Batch API - 50% off) - 最便宜的选项"
echo "预计可生成: ~160,000+ 个样本"
echo "========================================"
echo ""

# ---------------------------------------------------------------------------
# Pre-flight checks: fail before any estimate is run or any prompt is shown.
# Diagnostics go to stderr so they are not mixed into captured stdout.
# ---------------------------------------------------------------------------
if [[ ! -f "$INPUT_FILE" ]]; then
  echo "❌ 错误: 找不到输入文件 $INPUT_FILE" >&2
  exit 1
fi

# ${VAR:-} keeps this check producing its own message even if the script is
# ever run with `set -u`, where a bare "$OPENAI_API_KEY" would abort with a
# generic "unbound variable" error instead.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
  echo "❌ 错误: OPENAI_API_KEY 环境变量未设置" >&2
  echo " 请运行: export OPENAI_API_KEY='your-api-key'" >&2
  exit 1
fi

# ---------------------------------------------------------------------------
# Step 1/5: up-front cost estimate, followed by a y/n confirmation.
# The request count and average token figures are fixed planning numbers;
# the real request count is only known once the request file has been built.
# ---------------------------------------------------------------------------
echo "📊 步骤 1/5: 估算预算..."
echo "----------------------------------------"
python3 generate_problems_batch.py estimate \
  --num-requests 44000 \
  --avg-input-tokens 1917 \
  --avg-output-tokens 2552 \
  --model "$MODEL"

echo ""
# `read` returns non-zero on EOF / closed stdin. Under `set -e` that would end
# the script without any explanation, so the failure is reported explicitly.
if ! read -p "👉 继续执行? (y/n) " -n 1 -r; then
  echo ""
  echo "❌ 错误: 无法读取确认输入" >&2
  exit 1
fi
echo ""
# Only a single "y" or "Y" continues; anything else (including Enter) cancels.
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "❌ 已取消"
  exit 0
fi

# ---------------------------------------------------------------------------
# Step 2/5: build the batch request file from the input dataset and count
# the requests it contains (the line count is used as the request count).
# ---------------------------------------------------------------------------
echo ""
echo "📋 步骤 2/5: 准备批量请求..."
echo "----------------------------------------"
python3 generate_problems_batch.py prepare \
  --input "$INPUT_FILE" \
  --output "$BATCH_REQUESTS_FILE" \
  --min-score "$MIN_SCORE" \
  --model "$MODEL"

REQUEST_COUNT=$(wc -l < "$BATCH_REQUESTS_FILE")
# BSD/macOS `wc` pads its output with spaces; strip them so the number is
# clean when it is embedded in messages and passed as an argument.
REQUEST_COUNT=${REQUEST_COUNT//[[:space:]]/}

# An empty request file leaves nothing to estimate or submit.
if (( REQUEST_COUNT == 0 )); then
  echo "❌ 错误: 没有生成任何请求 ($BATCH_REQUESTS_FILE 为空)" >&2
  exit 1
fi
echo "✅ 已准备 $REQUEST_COUNT 个请求"

# ---------------------------------------------------------------------------
# Repeat the estimate with the real number of prepared requests, then ask for
# confirmation before anything is sent to OpenAI.
# ---------------------------------------------------------------------------
echo ""
echo "💰 根据实际请求数量重新估算..."
python3 generate_problems_batch.py estimate \
  --num-requests "$REQUEST_COUNT" \
  --avg-input-tokens 1917 \
  --avg-output-tokens 2552 \
  --model "$MODEL"

echo ""
# `read` returns non-zero on EOF / closed stdin. Under `set -e` that would end
# the script without any explanation, so the failure is reported explicitly.
if ! read -p "👉 继续提交到 OpenAI? (y/n) " -n 1 -r; then
  echo ""
  echo "❌ 错误: 无法读取确认输入" >&2
  exit 1
fi
echo ""
# Cancelling here keeps the prepared request file on disk for later use.
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "❌ 已取消 (批量请求文件已保存: $BATCH_REQUESTS_FILE)"
  exit 0
fi

# ---------------------------------------------------------------------------
# Step 3/5: submit the request file, extract the batch id from the helper's
# output and store it in $BATCH_ID_FILE.
# ---------------------------------------------------------------------------
echo ""
echo "🚀 步骤 3/5: 提交批处理任务到 OpenAI..."
echo "----------------------------------------"

# The exit status is captured by hand: with a plain assignment, `set -e` would
# terminate the script on failure before the captured output is ever printed.
submit_status=0
SUBMIT_OUTPUT=$(python3 generate_problems_batch.py submit \
  --input "$BATCH_REQUESTS_FILE" \
  --model "$MODEL" \
  --description "Scientific computing problems - $REQUEST_COUNT samples") \
  || submit_status=$?

echo "$SUBMIT_OUTPUT"

if (( submit_status != 0 )); then
  echo "❌ 错误: 提交失败 (退出码 $submit_status)" >&2
  exit "$submit_status"
fi

# Extract the id with bash's own regex matching instead of `grep -oP`, which
# only exists in GNU grep. The leftmost match is taken in both cases, so the
# id is always a single token.
#   1. the token following "Batch created: "
#   2. otherwise the first batch_<word> token anywhere in the output
# NOTE(review): the fallback would also match a file name such as
# "batch_requests_full" if the helper echoes its input path before the id;
# confirm against the helper's actual output format.
created_re='Batch created: ([^[:space:]]+)'
fallback_re='batch_[a-zA-Z0-9_]+'
BATCH_ID=""
if [[ $SUBMIT_OUTPUT =~ $created_re ]]; then
  BATCH_ID=${BASH_REMATCH[1]}
elif [[ $SUBMIT_OUTPUT =~ $fallback_re ]]; then
  BATCH_ID=${BASH_REMATCH[0]}
fi

if [[ -z "$BATCH_ID" ]]; then
  echo "❌ 错误: 无法获取 Batch ID" >&2
  echo "请手动检查输出并记录 Batch ID" >&2
  exit 1
fi

printf '%s\n' "$BATCH_ID" > "$BATCH_ID_FILE"
echo ""
echo "✅ Batch ID 已保存到: $BATCH_ID_FILE"
echo "📝 Batch ID: $BATCH_ID"
echo ""

# ---------------------------------------------------------------------------
# Step 4/5: optionally poll the batch status every 5 minutes until it reaches
# a terminal state. Declining prints the manual follow-up commands and exits.
# ---------------------------------------------------------------------------
echo "⏳ 步骤 4/5: 监控批处理状态..."
echo "----------------------------------------"
echo "批处理任务通常在几小时内完成(最多24小时)"
echo "您可以:"
echo " 1. 等待脚本自动监控(每5分钟检查一次)"
echo " 2. 按 Ctrl+C 退出,稍后运行监控命令:"
echo " python3 generate_problems_batch.py status $BATCH_ID"
echo ""

# The batch is already submitted at this point, so an unreadable answer
# (EOF / closed stdin) is treated as "no" and the manual instructions are
# shown, instead of letting `set -e` abort without telling the user how to
# continue.
read -p "👉 是否自动监控? (y/n) " -n 1 -r || REPLY=""
echo ""

if [[ $REPLY =~ ^[Yy]$ ]]; then
  echo "🔍 开始自动监控..."

  # A single failed status query (e.g. a network hiccup) must not end a
  # monitor that may have been running for hours; only give up after several
  # consecutive failures.
  max_poll_failures=5
  poll_failures=0

  while true; do
    TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')
    echo ""
    echo "[$TIMESTAMP] 检查批处理状态..."

    if STATUS_OUTPUT=$(python3 generate_problems_batch.py status "$BATCH_ID"); then
      poll_failures=0
    else
      poll_failures=$((poll_failures + 1))
      echo "$STATUS_OUTPUT"
      echo "⚠️ 状态查询失败 (连续 $poll_failures 次)" >&2
      if (( poll_failures >= max_poll_failures )); then
        echo "❌ 错误: 状态查询连续失败,停止监控" >&2
        echo "稍后请手动检查状态:" >&2
        echo " python3 generate_problems_batch.py status $BATCH_ID" >&2
        exit 1
      fi
      sleep 300
      continue
    fi
    echo "$STATUS_OUTPUT"

    # Substring match on the helper's "Status: <state>" text; the first
    # matching arm wins, in the same order the states were checked before.
    # "cancelled" is a terminal Batch API state as well; without it a
    # cancelled batch would be polled forever.
    # NOTE(review): assumes the helper prints it as "Status: cancelled",
    # like the other states - confirm.
    case "$STATUS_OUTPUT" in
      *"Status: completed"*)
        echo ""
        echo "✅ 批处理已完成!"
        break
        ;;
      *"Status: failed"*)
        echo ""
        echo "❌ 批处理失败!请检查错误信息" >&2
        exit 1
        ;;
      *"Status: expired"*)
        echo ""
        echo "❌ 批处理已过期(超过24小时)" >&2
        exit 1
        ;;
      *"Status: cancelled"*)
        echo ""
        echo "❌ 批处理已被取消" >&2
        exit 1
        ;;
    esac

    echo "⏳ 批处理仍在进行中,5分钟后再次检查..."
    sleep 300
  done
else
  echo "ℹ️ 跳过自动监控"
  echo "稍后请手动检查状态:"
  echo " python3 generate_problems_batch.py status $BATCH_ID"
  echo ""
  echo "完成后运行下载和处理命令:"
  echo " python3 generate_problems_batch.py download $BATCH_ID --output $BATCH_RESULTS_RAW"
  # Same options the automatic download/process stage of this script passes.
  echo " python3 generate_problems_batch.py process --input $BATCH_RESULTS_RAW --output $FINAL_OUTPUT --model $MODEL --requests $BATCH_REQUESTS_FILE"
  exit 0
fi

# ---------------------------------------------------------------------------
# Step 5/5: download the raw batch results, post-process them into the final
# output file, then print a summary with a few inspection commands.
# Every expansion passed as an argument is quoted so that a value containing
# whitespace or glob characters stays a single argument.
# ---------------------------------------------------------------------------
echo ""
echo "⬇️ 步骤 5/5: 下载和处理结果..."
echo "----------------------------------------"

python3 generate_problems_batch.py download "$BATCH_ID" \
  --output "$BATCH_RESULTS_RAW"

python3 generate_problems_batch.py process \
  --input "$BATCH_RESULTS_RAW" \
  --output "$FINAL_OUTPUT" \
  --model "$MODEL" \
  --requests "$BATCH_REQUESTS_FILE"

echo ""
echo "========================================"
echo "✅ 全部完成!"
echo "========================================"
echo "最终结果文件: $FINAL_OUTPUT"
echo ""
echo "查看结果:"
echo " head -1 $FINAL_OUTPUT | python3 -m json.tool"
echo " wc -l $FINAL_OUTPUT"
echo ""
echo "Batch ID: $BATCH_ID (已保存在 $BATCH_ID_FILE)"
echo "========================================"