Upload 45 files
- README.md +215 -3
- scripts/__pycache__/run_ablation_suite.cpython-311.pyc +0 -0
- scripts/augment_memory_from_standalone_runs.py +16 -0
- scripts/build_debate_memory.py +17 -0
- scripts/build_debug_memory.py +17 -0
- scripts/build_memory_assets.sh +56 -0
- scripts/build_memory_from_eval_results.py +17 -0
- scripts/execute.py +18 -0
- scripts/generate_with_memory.py +17 -0
- scripts/process_all_debate_cases.sh +64 -0
- scripts/run_ablation_suite.py +403 -0
- scripts/run_generate_and_evaluate.sh +640 -0
- scripts/run_memory_debate.py +17 -0
- scripts/test_self_healing_full.sh +92 -0
- src/debate_memory/__init__.py +11 -0
- src/debate_memory/__pycache__/__init__.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/__init__.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/build_memory_from_eval_results.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/config.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/config.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/debate_memory_builder.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/debug_executor.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/debug_memory.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/debug_memory_builder.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/generate_with_memory.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/generate_with_memory.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/llm.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/llm.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/memory_bank.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/memory_bank.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/run_memory_debate.cpython-311.pyc +0 -0
- src/debate_memory/augment_memory_from_standalone_runs.py +974 -0
- src/debate_memory/build_memory_from_eval_results.py +293 -0
- src/debate_memory/config.py +189 -0
- src/debate_memory/debate_memory_builder.py +477 -0
- src/debate_memory/debug_executor.py +136 -0
- src/debate_memory/debug_memory.py +163 -0
- src/debate_memory/debug_memory_builder.py +150 -0
- src/debate_memory/debug_utils.py +99 -0
- src/debate_memory/execute.py +522 -0
- src/debate_memory/generate_with_memory.py +920 -0
- src/debate_memory/llm.py +111 -0
- src/debate_memory/memory_bank.py +316 -0
- src/debate_memory/memory_intelligence.py +210 -0
- src/debate_memory/run_memory_debate.py +580 -0
README.md
CHANGED
# Agora-Opt Code Package

## What This Directory Contains

`./code/Agora-Opt/` is the source directory for the Agora-Opt method. It retains two categories of assets:

- the Agora-Opt implementation
- prebuilt memory assets used by the method

Historical run outputs are not stored here.

For compatibility with the original stage naming, the main reproduction script maintains two convenience paths:

- `generated_with_memory`
- `debate_runs`
## Important Subdirectories

The most important components are:

- `src/debate_memory/`: core Agora-Opt implementation
- `scripts/`: command-line wrappers
- `memory_storage/`: solution memory
- `debug_case_memory/`: debug memory retrieval bank
- `debate_memory_storage/`: debate memory retrieval bank
- `memory_variants/`: retained alternative memory variants
- `memory_backups/`: retained memory backups

Multiple memory versions are intentionally kept. They were prepared during different stages of the project and can all be treated as available assets for generation, debugging, and debate.
## Core Workflow

Agora-Opt runs in two stages.

### Stage 1: Generate Initial Solutions

`generate_with_memory.py` generates candidate solutions, optionally using solution memory and debug memory.

Primary entry script:

- `scripts/generate_with_memory.py`

This stage:

- reads benchmark problems
- retrieves similar solved cases from `memory_storage/`
- generates candidate modeling code
- uses debug memory during self-repair when execution fails
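A typical Stage 1 invocation, using flags that `scripts/run_generate_and_evaluate.sh` and `scripts/run_ablation_suite.py` also pass (the dataset name and output path here are illustrative):

```bash
python scripts/generate_with_memory.py \
  --dataset IndustryOR \
  --model gpt-4o \
  --temperature 0.01 \
  --memory_dir memory_storage \
  --memory_top_k 3 \
  --max_retries 5 \
  --execution_timeout 60 \
  --output generated_with_memory/gpt-4o_IndustryOR.jsonl
```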
### Stage 2: Run Debate

`run_memory_debate.py` takes two sets of initial solutions and runs the decentralized debate stage.

Primary entry script:

- `scripts/run_memory_debate.py`

This stage:

- loads both sides' initial solutions
- retrieves historical debate cases from `debate_memory_storage/`
- performs iterative comparison, revision, and convergence
- executes and evaluates the final consensus solution
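The debate wrapper's flag set is defined in `src/debate_memory/run_memory_debate.py` and is not reproduced here; assuming it exposes an argparse CLI like the sibling wrappers, its options can be listed with:

```bash
python scripts/run_memory_debate.py --help
```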
## Memory Types

### 1. Solution Memory

Directory:

- `memory_storage/`

Purpose:

- retrieves similar successful modeling cases during generation
- supplies formulation templates and structural priors

Build path:

- extract `(problem description, correct code, objective value)` from correctly evaluated runs
- build `cases.jsonl` plus its retrieval index

Related script:

- `scripts/build_memory_from_eval_results.py`
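To rebuild this store directly, the wrapper can be invoked the same way `build_memory_assets.sh` invokes it (the evaluation directory is a placeholder):

```bash
python scripts/build_memory_from_eval_results.py \
  --eval_dirs /path/to/eval_dir1 \
  --benchmarks_dir ../../data/benchmarks \
  --memory_dir memory_storage
```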
### 2. Debug Memory

Directory:

- `debug_case_memory/`

Purpose:

- retrieves similar execution failures and repair experience
- supports automatic self-debugging during generation

Build path:

- extract unique error signatures from `debug_memory.jsonl` and its backups
- normalize the error text, repair hints, and metadata into a retrieval bank

Related script:

- `scripts/build_debug_memory.py`

Note:

- raw debug logs are stored in `memory_storage/debug_memory.jsonl`
- that log file is one of the inputs used to build debug memory
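`build_memory_assets.sh` invokes this wrapper with only an output directory, which is enough to consolidate the raw log and its backups:

```bash
python scripts/build_debug_memory.py --output_dir debug_case_memory
```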
### 3. Debate Memory

Directory:

- `debate_memory_storage/`

Purpose:

- stores examples of how disagreements were resolved during debate
- helps later debates converge more efficiently

Build path:

- select historical runs where the two initial solutions disagreed
- keep cases where debate eventually converged successfully
- extract the dispute, key arguments, and final converged code

Related scripts:

- `scripts/build_debate_memory.py`
- `scripts/process_all_debate_cases.sh`
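A direct rebuild uses the same flags `process_all_debate_cases.sh` passes (the runs path is an example):

```bash
python scripts/build_debate_memory.py \
  --runs_root ../../results/Agora-Opt/debate \
  --output_dir debate_memory_storage \
  --llm_model gpt-4o
```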
## Suggested Memory Construction Order

When preparing memory from scratch, the recommended order is:

1. run generation and evaluation to obtain `evaluation_results`
2. build solution memory from correct cases
3. build debug memory from accumulated `debug_memory.jsonl`
4. build debate memory from historical debate runs

The dependency flow is:

- `evaluation_results` -> `solution memory`
- `debug_memory.jsonl` -> `debug memory`
- debate run artifacts -> `debate memory`
## Retained Memory Assets

This directory intentionally keeps:

- the three primary memory stores
- memory variants
- memory backups

These are treated as static method assets.

Historical run outputs are not retained here, which keeps source code, memory assets, and new results clearly separated.

To rebuild the three memory types, use:

```bash
bash ./code/Agora-Opt/scripts/build_memory_assets.sh /path/to/eval_dir1 /path/to/eval_dir2
```

That script attempts to:

- rebuild solution memory from evaluation directories
- rebuild debug memory from `debug_memory.jsonl` and its backups
- rebuild debate memory from debate run artifacts
## Recommended Entry Points

For paper reproduction, use the outer scripts rather than manually assembling commands in this directory:

- main table: `./code/scripts/run_agora.sh`
- 5.1: `./code/experiments/5.1_compatibility_backbone_llms/`
- 5.2: `./code/experiments/5.2_ablation_study/`
- 5.3.1: `./code/experiments/5.3.1_centralized_judge_selection/`
- 5.3.2: `./code/experiments/5.3.2_impact_of_debate_rounds/`
- 5.3.3: `./code/experiments/5.3.3_generalization_of_decentralized_debate_protocol/`
## Direct Source-Level Usage

For direct method-level use, the main wrappers are:

```bash
python scripts/generate_with_memory.py
python scripts/run_memory_debate.py
python scripts/execute.py
python scripts/build_memory_from_eval_results.py
python scripts/build_debug_memory.py
python scripts/build_debate_memory.py
```
## Path Conventions

Within the open-source package, the intended layout is:

- benchmark data: `./data/benchmarks/`
- Agora-Opt source code and memory: `./code/Agora-Opt/`

This separation makes the boundaries between code, memory assets, and newly generated outputs explicit.
scripts/__pycache__/run_ablation_suite.cpython-311.pyc
ADDED
Binary file (19.3 kB).
scripts/augment_memory_from_standalone_runs.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper for debate_memory.augment_memory_from_standalone_runs."""

from pathlib import Path
import sys

# Make src/ importable so the debate_memory package resolves when this
# wrapper is run directly from scripts/.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.augment_memory_from_standalone_runs import main


if __name__ == "__main__":
    main()
```
scripts/build_debate_memory.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper for debate_memory.debate_memory_builder."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.debate_memory_builder import main


if __name__ == "__main__":
    main()
```
scripts/build_debug_memory.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper to consolidate debug_memory.jsonl entries into a memory bank."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.debug_memory_builder import main


if __name__ == "__main__":
    main()
```
scripts/build_memory_assets.sh
ADDED
```bash
#!/bin/bash

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
AGORA_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
OPEN_ROOT="$(cd "${AGORA_DIR}/../.." && pwd)"
RESULTS_ROOT="${OPEN_ROOT}/results"
BENCHMARK_DIR="${OPEN_ROOT}/data/benchmarks"
PYTHON_BIN="${PYTHON_BIN:-python3}"

SOLUTION_MEMORY_DIR="${SOLUTION_MEMORY_DIR:-${AGORA_DIR}/memory_storage}"
DEBUG_CASE_MEMORY_DIR="${DEBUG_CASE_MEMORY_DIR:-${AGORA_DIR}/debug_case_memory}"
DEBATE_MEMORY_DIR="${DEBATE_MEMORY_DIR:-${AGORA_DIR}/debate_memory_storage}"
DEBATE_RUNS_ROOT="${DEBATE_RUNS_ROOT:-${RESULTS_ROOT}/Agora-Opt/debate}"

export PYTHONPATH="${AGORA_DIR}/src:${PYTHONPATH:-}"

echo "============================================================"
echo "Agora-Opt Memory Builder"
echo "============================================================"
echo "Solution memory: ${SOLUTION_MEMORY_DIR}"
echo "Debug memory:    ${DEBUG_CASE_MEMORY_DIR}"
echo "Debate memory:   ${DEBATE_MEMORY_DIR}"
echo "Debate runs:     ${DEBATE_RUNS_ROOT}"
echo "============================================================"
echo

if [[ "$#" -gt 0 ]]; then
  echo "Building solution memory from evaluation directories..."
  "${PYTHON_BIN}" "${SCRIPT_DIR}/build_memory_from_eval_results.py" \
    --eval_dirs "$@" \
    --benchmarks_dir "${BENCHMARK_DIR}" \
    --memory_dir "${SOLUTION_MEMORY_DIR}"
  echo
else
  echo "Skipping solution memory rebuild because no evaluation directories were provided."
  echo "Usage example:"
  echo "  bash ./code/Agora-Opt/scripts/build_memory_assets.sh /path/to/eval_dir1 /path/to/eval_dir2"
  echo
fi

echo "Building debug memory..."
"${PYTHON_BIN}" "${SCRIPT_DIR}/build_debug_memory.py" \
  --output_dir "${DEBUG_CASE_MEMORY_DIR}"
echo

if [[ -d "${DEBATE_RUNS_ROOT}" ]]; then
  echo "Building debate memory..."
  "${PYTHON_BIN}" "${SCRIPT_DIR}/build_debate_memory.py" \
    --runs_root "${DEBATE_RUNS_ROOT}" \
    --output_dir "${DEBATE_MEMORY_DIR}"
else
  echo "Skipping debate memory rebuild because debate runs root does not exist:"
  echo "  ${DEBATE_RUNS_ROOT}"
fi
```
scripts/build_memory_from_eval_results.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper for debate_memory.build_memory_from_eval_results."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.build_memory_from_eval_results import main


if __name__ == "__main__":
    main()
```
scripts/execute.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper to run debate_memory.execute with package imports resolved."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.execute import parse_args, main


if __name__ == "__main__":
    args = parse_args()
    main(args)
```
scripts/generate_with_memory.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper to run debate_memory.generate_with_memory as a script."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.generate_with_memory import main


if __name__ == "__main__":
    main()
```
scripts/process_all_debate_cases.sh
ADDED
```bash
#!/bin/bash

# Batch process every historical debate run and refresh the debate memory bank.
#
# Usage:
#   ./scripts/process_all_debate_cases.sh [runs_root] [output_dir]
# Example:
#   ./scripts/process_all_debate_cases.sh \
#     ../../results/Agora-Opt/debate \
#     debate_memory_storage
#
# Environment variables (optional):
#   LLM_MODEL    - override default gpt-4o summarizer
#   LLM_ATTEMPTS - retries per case (default 2)
#   MAX_WORKERS  - thread pool size (default 64)
#   PYTHON_BIN   - python executable (default python)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
DEFAULT_RUNS_ROOT="${PROJECT_ROOT}/../../results/Agora-Opt/debate"

RUNS_ROOT="${1:-$DEFAULT_RUNS_ROOT}"
OUTPUT_DIR="${2:-${PROJECT_ROOT}/debate_memory_storage}"

LLM_MODEL="${LLM_MODEL:-gpt-4o}"
LLM_ATTEMPTS="${LLM_ATTEMPTS:-2}"
MAX_WORKERS="${MAX_WORKERS:-64}"
PYTHON_BIN="${PYTHON_BIN:-python}"

echo "============================================================"
echo "🧠 Building Debate Memory"
echo "============================================================"
echo "Runs root:     ${RUNS_ROOT}"
echo "Output dir:    ${OUTPUT_DIR}"
echo "LLM model:     ${LLM_MODEL:-<heuristic>}"
echo "LLM attempts:  ${LLM_ATTEMPTS}"
echo "Max workers:   ${MAX_WORKERS}"
echo "Python binary: ${PYTHON_BIN}"
echo "============================================================"
echo

CMD=(
  "${PYTHON_BIN}"
  "${PROJECT_ROOT}/scripts/build_debate_memory.py"
  "--runs_root" "${RUNS_ROOT}"
  "--output_dir" "${OUTPUT_DIR}"
  "--max_workers" "${MAX_WORKERS}"
  "--llm_attempts" "${LLM_ATTEMPTS}"
)

if [ -n "${LLM_MODEL}" ]; then
  CMD+=("--llm_model" "${LLM_MODEL}")
fi

echo "Running: ${CMD[*]}"
echo

"${CMD[@]}"

echo
echo "✅ Debate memory refreshed."
echo "Cases stored in: ${OUTPUT_DIR}"
```
scripts/run_ablation_suite.py
ADDED
```python
#!/usr/bin/env python3
"""
Run a suite of ablation experiments (generation + evaluation) and summarise results.
"""

from __future__ import annotations

import argparse
import json
import os
import shlex
import subprocess
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Sequence, Tuple

PROJECT_ROOT = Path(__file__).resolve().parent.parent
STANDARD_RESULTS_ROOT = PROJECT_ROOT.parent.parent / "results" / "Agora-Opt"
GENERATE_SCRIPT = PROJECT_ROOT / "scripts" / "generate_with_memory.py"
EXECUTE_SCRIPT = PROJECT_ROOT / "scripts" / "execute.py"
PYTHON_BIN = os.environ.get("PYTHON_BIN", sys.executable)


@dataclass
class Variant:
    name: str
    description: str
    overrides: Dict[str, object]


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Run generate+evaluate ablations and emit a summary table."
    )
    parser.add_argument("--model", type=str, default="gpt-4o", help="LLM to query.")
    parser.add_argument(
        "--datasets",
        nargs="+",
        default=["IndustryOR", "ComplexLP"],
        help="Datasets to evaluate (space-separated, omit .jsonl).",
    )
    parser.add_argument("--temperature", type=float, default=0.01)
    parser.add_argument(
        "--max_problems",
        type=int,
        default=None,
        help="Limit number of problems per dataset (omit for full set).",
    )
    parser.add_argument("--memory_dir", type=str, default="memory_storage")
    parser.add_argument(
        "--memory_top_k",
        type=int,
        default=3,
        help="Base episodic memory retrieval count for the full variant.",
    )
    parser.add_argument(
        "--max_retries",
        type=int,
        default=5,
        help="Base retry budget for the full variant.",
    )
    parser.add_argument(
        "--debug_case_top_k",
        type=int,
        default=3,
        help="Base debug-case retrieval count.",
    )
    parser.add_argument(
        "--parallel",
        type=int,
        default=64,
        help="Workers for generation (passed to --parallel).",
    )
    parser.add_argument(
        "--execution_timeout",
        type=int,
        default=90,
        help="Timeout per execution attempt in generate_with_memory.",
    )
    parser.add_argument(
        "--debug_memory_path",
        type=str,
        default="memory_storage/debug_memory.jsonl",
        help="Path to debug memory JSONL.",
    )
    parser.add_argument(
        "--debug_case_dir",
        type=str,
        default="debug_case_memory",
        help="Directory containing consolidated debug-case memory.",
    )
    parser.add_argument(
        "--output_root",
        type=str,
        default=str(STANDARD_RESULTS_ROOT / "ablations"),
        help="Root folder for storing ablation artefacts.",
    )
    parser.add_argument(
        "--eval_timeout",
        type=int,
        default=90,
        help="Timeout for scripts/execute.py.",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=64,
        help="ProcessPool workers for evaluation.",
    )
    parser.add_argument("--tolerance", type=float, default=0.05)
    parser.add_argument(
        "--relative_tolerance",
        action="store_true",
        help="Use relative tolerance in evaluation.",
    )
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Print commands without executing or aggregating results.",
    )
    return parser.parse_args()


def build_variants(args: argparse.Namespace) -> List[Variant]:
    base = {
        "memory_top_k": args.memory_top_k,
        "use_llm_refinement": True,
        "debug_case_memory_top_k": args.debug_case_top_k,
        "max_retries": args.max_retries,
        "auto_debug": True,
    }
    return [
        Variant(
            name="full_system",
            description="All helpers enabled (reference).",
            overrides={**base},
        ),
        Variant(
            name="no_llm_refine",
            description="Skip LLM summarisation of retrieved cases.",
            overrides={**base, "use_llm_refinement": False},
        ),
        Variant(
            name="no_debug_case_memory",
            description="Disable historical debug-case retrieval.",
            overrides={**base, "debug_case_memory_top_k": 0},
        ),
        Variant(
            name="no_self_healing",
            description="Single attempt (max_retries=1) but still executes locally once.",
            overrides={**base, "max_retries": 1},
        ),
        Variant(
            name="no_memory",
            description="Disable episodic retrieval, keep retries on.",
            overrides={**base, "memory_top_k": 0, "use_llm_refinement": False},
        ),
        Variant(
            name="vanilla_llm",
            description="Pure single-shot LLM (no memory, no auto-debug).",
            overrides={
                **base,
                "memory_top_k": 0,
                "use_llm_refinement": False,
                "debug_case_memory_top_k": 0,
                "max_retries": 1,
                "auto_debug": False,
            },
        ),
    ]


def run_command(cmd: Sequence[str], dry_run: bool = False) -> None:
    pretty = " ".join(shlex.quote(part) for part in cmd)
    print(f"  → {pretty}")
    if dry_run:
        return
    subprocess.run(cmd, check=True)


def compute_attempt_stats(path: Path) -> Tuple[float, int]:
    if not path.exists():
        return 0.0, 0
    total = 0
    total_attempts = 0
    multi_attempt = 0
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            attempts = record.get("total_attempts", 1)
            total_attempts += attempts
            total += 1
            if attempts > 1:
                multi_attempt += 1
    avg = (total_attempts / total) if total else 0.0
    return avg, multi_attempt


def format_percent(value: float) -> str:
    return f"{value * 100:.1f}%"


def build_generate_args(
    dataset: str,
    output_file: Path,
    debug_dir: Path,
    args: argparse.Namespace,
    cfg: Dict[str, object],
) -> List[str]:
    cmd = [
        os.fspath(GENERATE_SCRIPT),
        "--dataset",
        dataset,
        "--model",
        args.model,
        "--temperature",
        str(args.temperature),
        "--output",
        os.fspath(output_file),
        "--memory_dir",
        os.fspath(Path(args.memory_dir).resolve()),
        "--parallel",
        str(args.parallel),
        "--execution_timeout",
        str(args.execution_timeout),
        "--debug_memory_path",
        os.fspath(Path(args.debug_memory_path).resolve()),
        "--debug_case_memory_dir",
        os.fspath(Path(args.debug_case_dir).resolve()),
        "--debug_case_memory_top_k",
        str(int(cfg.get("debug_case_memory_top_k", 0))),
        "--memory_top_k",
        str(int(cfg.get("memory_top_k", 0))),
        "--max_retries",
        str(int(cfg.get("max_retries", 1))),
    ]
    if args.max_problems:
        cmd += ["--max_problems", str(args.max_problems)]
    if cfg.get("use_llm_refinement"):
        cmd.append("--use_llm_refinement")
    if not cfg.get("filter_perfect", True):
        cmd.append("--no_filter_perfect")
    if not cfg.get("auto_debug", True):
        cmd.append("--no_auto_debug")
    if debug_dir:
        cmd += ["--debug_output_dir", os.fspath(debug_dir)]
    return [os.fspath(part) for part in cmd]


def build_execute_args(input_file: Path, output_dir: Path, args: argparse.Namespace) -> List[str]:
    cmd = [
        os.fspath(EXECUTE_SCRIPT),
        "--input_file",
        os.fspath(input_file),
        "--output_dir",
        os.fspath(output_dir),
        "--timeout",
        str(args.eval_timeout),
        "--tolerance",
        str(args.tolerance),
        "--num_workers",
        str(args.num_workers),
        "--memory_dir",
        os.fspath(Path(args.memory_dir).resolve()),
        "--debug_memory_path",
        os.fspath(Path(args.debug_memory_path).resolve()),
    ]
    if args.relative_tolerance:
        cmd.append("--use_relative_tolerance")
    return cmd


def summarise_records(records: List[Dict], summary_path: Path) -> None:
    if not records:
        return
    md_lines = [
        "| Dataset | Variant | Accuracy | Correct/Total | Exec Err % | Timeout % | No-Code % | Avg Attempts | Notes |",
        "| --- | --- | --- | --- | --- | --- | --- | --- | --- |",
    ]
    csv_lines = [
        "dataset,variant,accuracy,correct,total,exec_error_pct,timeout_pct,no_code_pct,avg_attempts,notes"
    ]
    for record in records:
        dataset = record["dataset"]
        variant = record["variant"]
        report = record["report"]
        status_counts = report.get("status_counts", {})
        total = report.get("total_problems", 0)
        accuracy_pct = format_percent(report.get("accuracy", 0.0))
        correct = report.get("correct", 0)
        exec_err_pct = (
            (status_counts.get("execution_error", 0) / total) if total else 0.0
        )
        timeout_pct = (status_counts.get("timeout", 0) / total) if total else 0.0
        no_code_pct = (status_counts.get("no_code", 0) / total) if total else 0.0
        avg_attempts = record.get("avg_attempts", 0.0)
        notes = record["notes"]
        md_lines.append(
            f"| {dataset} | {variant} | {accuracy_pct} | {correct}/{total} | "
            f"{exec_err_pct*100:.1f}% | {timeout_pct*100:.1f}% | {no_code_pct*100:.1f}% | "
            f"{avg_attempts:.2f} | {notes} |"
        )
        safe_notes = notes.replace('"', '""')
        csv_lines.append(
            f"{dataset},{variant},{report.get('accuracy',0.0):.4f},{correct},{total},"
            f"{exec_err_pct:.4f},{timeout_pct:.4f},{no_code_pct:.4f},{avg_attempts:.4f},\"{safe_notes}\""
        )
    summary_path.write_text("\n".join(md_lines) + "\n", encoding="utf-8")
    csv_path = summary_path.with_suffix(".csv")
    csv_path.write_text("\n".join(csv_lines) + "\n", encoding="utf-8")
    print(f"\n✅ Summary table written to: {summary_path}")
    print(f"📄 CSV export written to: {csv_path}")


def main() -> None:
    args = parse_args()
    variants = build_variants(args)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_root = Path(args.output_root).resolve() / timestamp
    if not args.dry_run:
        run_root.mkdir(parents=True, exist_ok=True)

    print("========================================")
    print("Ablation Runner")
    print("========================================")
    print(f"Model: {args.model}")
    print(f"Datasets: {', '.join(args.datasets)}")
    print(f"Output root: {run_root if not args.dry_run else args.output_root}")
    print(f"Dry run: {args.dry_run}")
    print("========================================\n")

    records: List[Dict] = []
    for dataset in args.datasets:
        print(f"Dataset: {dataset}")
        for variant in variants:
            cfg = variant.overrides
            variant_name = variant.name
            print(f"  Variant: {variant_name} – {variant.description}")
            dataset_slug = dataset.replace("/", "_")
            gen_output = (
                run_root / f"{dataset_slug}_{variant_name}.jsonl"
                if not args.dry_run
                else Path(f"{dataset_slug}_{variant_name}.jsonl")
            )
            debug_dir = (
                run_root / "debug" / dataset_slug / variant_name
                if not args.dry_run
                else Path(f"debug/{dataset_slug}/{variant_name}")
            )
            eval_dir = (
                run_root / f"{dataset_slug}_{variant_name}_eval"
                if not args.dry_run
                else Path(f"{dataset_slug}_{variant_name}_eval")
            )
            if not args.dry_run:
                debug_dir.mkdir(parents=True, exist_ok=True)
            gen_cmd = [PYTHON_BIN] + build_generate_args(
                dataset, gen_output, debug_dir, args, cfg
            )
            run_command(gen_cmd, dry_run=args.dry_run)

            exec_cmd = [
                PYTHON_BIN,
            ] + build_execute_args(gen_output, eval_dir, args)
            run_command(exec_cmd, dry_run=args.dry_run)

            if args.dry_run:
                continue

            report_path = eval_dir / "evaluation_report.json"
            if not report_path.exists():
                raise FileNotFoundError(
                    f"Missing evaluation report for {dataset} / {variant_name}: {report_path}"
                )
            with report_path.open("r", encoding="utf-8") as handle:
                report = json.load(handle)
            avg_attempts, _ = compute_attempt_stats(gen_output)
            records.append(
                {
                    "dataset": dataset,
                    "variant": variant_name,
                    "report": report,
                    "avg_attempts": avg_attempts,
                    "notes": variant.description,
                }
            )
        print("")

    if args.dry_run:
        print("Dry run completed. No commands were executed.")
        return

    summary_path = run_root / "ablation_summary.md"
    summarise_records(records, summary_path)


if __name__ == "__main__":
    main()
```
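For reference, a dry-run invocation of the suite (all flags are defined in `parse_args` above; the dataset list is illustrative):

```bash
python scripts/run_ablation_suite.py \
  --model gpt-4o \
  --datasets IndustryOR ComplexLP \
  --dry_run
```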
scripts/run_generate_and_evaluate.sh
ADDED
|
@@ -0,0 +1,640 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
set -uo pipefail
|
| 4 |
+
|
| 5 |
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 6 |
+
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
| 7 |
+
OPEN_ROOT="$(cd "${PROJECT_ROOT}/../.." && pwd)"
|
| 8 |
+
SRC_DIR="${PROJECT_ROOT}/src"
|
| 9 |
+
export PYTHONPATH="${SRC_DIR}:${PYTHONPATH:-}"
|
| 10 |
+
|
| 11 |
+
# Generate and Evaluate - Combined pipeline for generation + evaluation
|
| 12 |
+
# Usage: ./run_generate_and_evaluate.sh [model_name] [max_problems] [num_workers] [timeout] [tolerance] [dataset_name]
|
| 13 |
+
#
|
| 14 |
+
# Environment Variables:
|
| 15 |
+
# REFRESH_DEBUG_MEMORY - Set to "false" to disable auto-backup and clearing of debug memory (default: true)
|
| 16 |
+
# RUN_ALL_BENCHMARKS - Set to "true" to run all benchmarks in ./data/benchmarks/ (default: true)
|
| 17 |
+
# USE_HF_OFFLINE - Set to "false" to allow downloading models from Hugging Face (default: true)
|
| 18 |
+
# PARALLEL_BENCHMARKS - Set to "true" to run benchmarks in parallel (default: true)
|
| 19 |
+
# MAX_PARALLEL_JOBS - Maximum number of parallel jobs (default: 4)
|
| 20 |
+
# DATASET_NAME - Dataset to run when RUN_ALL_BENCHMARKS=false (default: IndustryOR)
|
| 21 |
+
# EMBEDDING_MODEL - Optional embedding model name or local path passed to memory retrieval
|
| 22 |
+
#
|
| 23 |
+
# Example:
|
| 24 |
+
# ./run_generate_and_evaluate.sh # Run with default settings (all benchmarks, offline mode, parallel)
|
| 25 |
+
# RUN_ALL_BENCHMARKS=false ./run_generate_and_evaluate.sh # Run single dataset
|
| 26 |
+
# RUN_ALL_BENCHMARKS=false ./run_generate_and_evaluate.sh gpt-4o 100 64 90 0.05 OPT-Principled
|
| 27 |
+
# USE_HF_OFFLINE=false ./run_generate_and_evaluate.sh # Allow downloading models
|
| 28 |
+
# REFRESH_DEBUG_MEMORY=false ./run_generate_and_evaluate.sh # Run without refreshing debug memory
|
| 29 |
+
# PARALLEL_BENCHMARKS=false ./run_generate_and_evaluate.sh # Run sequentially
|
| 30 |
+
# MAX_PARALLEL_JOBS=2 ./run_generate_and_evaluate.sh # Limit to 2 parallel jobs
|
| 31 |
+
|
| 32 |
+
MODEL=${1:-"gpt-4o"}
|
| 33 |
+
MAX_PROBLEMS=${2:-1000}
|
| 34 |
+
NUM_WORKERS=${3:-100}
|
| 35 |
+
TIMEOUT=${4:-60}
|
| 36 |
+
TOLERANCE=${5:-0.05}
|
| 37 |
+
|
| 38 |
+
# Configuration: Auto-backup and clear debug memory before running
|
| 39 |
+
# Set to "false" to disable this feature
|
| 40 |
+
REFRESH_DEBUG_MEMORY=${REFRESH_DEBUG_MEMORY:-true}
|
| 41 |
+
|
| 42 |
+
# Configuration: Run all benchmarks or single dataset
|
| 43 |
+
RUN_ALL_BENCHMARKS=${RUN_ALL_BENCHMARKS:-true}
|
| 44 |
+
|
| 45 |
+
# Configuration: Use offline mode for Hugging Face (avoid network calls)
|
| 46 |
+
# Set to "false" if you need to download models for the first time
|
| 47 |
+
USE_HF_OFFLINE=${USE_HF_OFFLINE:-true}
|
| 48 |
+
|
| 49 |
+
# Configuration: Run benchmarks in parallel
|
| 50 |
+
# Set to "true" to enable concurrent datasets (default: sequential datasets)
|
| 51 |
+
PARALLEL_BENCHMARKS=${PARALLEL_BENCHMARKS:-false}
|
| 52 |
+
|
| 53 |
+
# Configuration: Maximum number of parallel jobs
|
| 54 |
+
# Adjust based on your system resources
|
| 55 |
+
MAX_PARALLEL_JOBS=${MAX_PARALLEL_JOBS:-4}
|
| 56 |
+
|
| 57 |
+
# Default single dataset
|
| 58 |
+
DEFAULT_DATASET=${DATASET_NAME:-${6:-"IndustryOR"}}
|
| 59 |
+
# DEFAULT_DATASET="ComplexOR"
|
| 60 |
+
TEMPERATURE=${TEMPERATURE:-0.01}
|
| 61 |
+
MEMORY_DIR="${PROJECT_ROOT}/memory_storage"
|
| 62 |
+
MEMORY_TOP_K=${MEMORY_TOP_K:-3}
|
| 63 |
+
PARALLEL=${PARALLEL:-128}
|
| 64 |
+
MAIN_TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
|
| 65 |
+
OUTPUT_DIR="${OPEN_ROOT}/results/Agora-Opt/generate_and_evaluate"
|
| 66 |
+
MAX_RETRIES=${MAX_RETRIES:-5}
|
| 67 |
+
BENCHMARKS_DIR="${PROJECT_ROOT}/../../data/benchmarks"
|
| 68 |
+
EMBEDDING_MODEL=${EMBEDDING_MODEL:-}
|
| 69 |
+
|
| 70 |
+
GENERATE_CLI="${PROJECT_ROOT}/scripts/generate_with_memory.py"
|
| 71 |
+
EXECUTE_CLI="${PROJECT_ROOT}/scripts/execute.py"
|
| 72 |
+
|
| 73 |
+
if [ -d "${BENCHMARKS_DIR}" ]; then
|
| 74 |
+
BENCHMARKS_DIR="$(cd "${BENCHMARKS_DIR}" && pwd)"
|
| 75 |
+
elif [ -d "${PROJECT_ROOT}/clean_benchmarks" ]; then
|
| 76 |
+
BENCHMARKS_DIR="$(cd "${PROJECT_ROOT}/clean_benchmarks" && pwd)"
|
| 77 |
+
elif [ -d "${PROJECT_ROOT}/../clean_benchmarks" ]; then
|
| 78 |
+
BENCHMARKS_DIR="$(cd "${PROJECT_ROOT}/../clean_benchmarks" && pwd)"
|
| 79 |
+
fi
|
| 80 |
+
|
| 81 |
+
# Create output directory
|
| 82 |
+
mkdir -p "${OUTPUT_DIR}"
|
| 83 |
+
|
| 84 |
+
ensure_or_debate_env() {
|
| 85 |
+
if [ "${CONDA_DEFAULT_ENV:-}" = "or-debate" ] && command -v python >/dev/null 2>&1; then
|
| 86 |
+
return 0
|
| 87 |
+
fi
|
| 88 |
+
|
| 89 |
+
if ! command -v conda >/dev/null 2>&1; then
|
| 90 |
+
echo "❌ conda command not found. Please install Conda or activate the or-debate environment manually."
|
| 91 |
+
return 1
|
| 92 |
+
fi
|
| 93 |
+
|
| 94 |
+
local conda_bin
|
| 95 |
+
local conda_base
|
| 96 |
+
conda_bin="$(command -v conda)"
|
| 97 |
+
conda_base="$(cd "$(dirname "${conda_bin}")/.." && pwd)"
|
| 98 |
+
|
| 99 |
+
if [ -f "${conda_base}/etc/profile.d/conda.sh" ]; then
|
| 100 |
+
# shellcheck disable=SC1090
|
| 101 |
+
source "${conda_base}/etc/profile.d/conda.sh"
|
| 102 |
+
else
|
| 103 |
+
eval "$("${conda_bin}" shell.bash hook)"
|
| 104 |
+
fi
|
| 105 |
+
|
| 106 |
+
conda activate or-debate
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
# ============================================
|
| 110 |
+
# Function: Backup and Clear Debug Memory
|
| 111 |
+
# ============================================
|
| 112 |
+
backup_debug_memory() {
|
| 113 |
+
if [ "${REFRESH_DEBUG_MEMORY}" = "true" ]; then
|
| 114 |
+
DEBUG_MEMORY_FILE="${MEMORY_DIR}/debug_memory.jsonl"
|
| 115 |
+
BACKUP_DIR="${MEMORY_DIR}/backups/${MAIN_TIMESTAMP}"
|
| 116 |
+
|
| 117 |
+
if [ -f "${DEBUG_MEMORY_FILE}" ]; then
|
| 118 |
+
echo "================================================"
|
| 119 |
+
echo "🗂️ Backing up debug memory..."
|
| 120 |
+
echo "================================================"
|
| 121 |
+
|
| 122 |
+
# Create backup directory
|
| 123 |
+
mkdir -p ${BACKUP_DIR}
|
| 124 |
+
|
| 125 |
+
# Copy debug_memory.jsonl to backup
|
| 126 |
+
cp "${DEBUG_MEMORY_FILE}" "${BACKUP_DIR}/debug_memory.jsonl"
|
| 127 |
+
|
| 128 |
+
# Get file size and line count
|
| 129 |
+
FILE_SIZE=$(du -h "${DEBUG_MEMORY_FILE}" | cut -f1)
|
| 130 |
+
LINE_COUNT=$(wc -l < "${DEBUG_MEMORY_FILE}")
|
| 131 |
+
|
| 132 |
+
echo "✅ Backed up debug memory:"
|
| 133 |
+
echo " Location: ${BACKUP_DIR}/debug_memory.jsonl"
|
| 134 |
+
echo " Size: ${FILE_SIZE}"
|
| 135 |
+
echo " Lines: ${LINE_COUNT}"
|
| 136 |
+
|
| 137 |
+
# Clear the original file
|
| 138 |
+
> "${DEBUG_MEMORY_FILE}"
|
| 139 |
+
echo "✅ Cleared original debug memory file"
|
| 140 |
+
echo ""
|
| 141 |
+
else
|
| 142 |
+
echo "ℹ️ No debug memory file found, skipping backup"
|
| 143 |
+
echo ""
|
| 144 |
+
fi
|
| 145 |
+
else
|
| 146 |
+
echo "ℹ️ Debug memory refresh is disabled (REFRESH_DEBUG_MEMORY=false)"
|
| 147 |
+
echo ""
|
| 148 |
+
fi
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
normalize_dataset_name() {
|
| 152 |
+
local dataset_name="$1"
|
| 153 |
+
dataset_name="${dataset_name%.jsonl}"
|
| 154 |
+
case "${dataset_name}" in
|
| 155 |
+
ComplexLP_clean) echo "ComplexLP" ;;
|
| 156 |
+
EasyLP_clean) echo "EasyLP" ;;
|
| 157 |
+
IndustryOR_clean|IndustryOR_v2|IndustryOR_fixedV2|IndustryOR_fixedV2_clean) echo "IndustryOR" ;;
|
| 158 |
+
NL4Opt|NL4Opt_clean|NL4OPT_clean) echo "NL4OPT" ;;
|
| 159 |
+
NLP4LP_clean) echo "NLP4LP" ;;
|
| 160 |
+
ComplexOR_clean) echo "ComplexOR" ;;
|
| 161 |
+
ReSocratic_clean) echo "ReSocratic" ;;
|
| 162 |
+
combined|combined_dataset|OPT-Principled_clean) echo "OPT-Principled" ;;
|
| 163 |
+
*) echo "${dataset_name}" ;;
|
| 164 |
+
esac
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
DEFAULT_DATASET="$(normalize_dataset_name "${DEFAULT_DATASET}")"
|
| 168 |
+
|
| 169 |
+
# ============================================
|
| 170 |
+
# Function: Run single dataset (core logic)
|
| 171 |
+
# ============================================
|
| 172 |
+
process_dataset() {
|
| 173 |
+
local DATASET_NAME
|
| 174 |
+
DATASET_NAME="$(normalize_dataset_name "$1")"
|
| 175 |
+
local TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
|
| 176 |
+
local OUTPUT_FILE="${OUTPUT_DIR}/${MODEL}_${DATASET_NAME}_${TIMESTAMP}.jsonl"
|
| 177 |
+
local EVAL_FILE="${OUTPUT_DIR}/${MODEL}_${DATASET_NAME}_eval_${TIMESTAMP}.jsonl"
|
| 178 |
+
local EVAL_REPORT="${EVAL_FILE}/evaluation_report.json"
|
| 179 |
+
|
| 180 |
+
echo ""
|
| 181 |
+
echo "╔════════════════════════════════════════════════╗"
|
| 182 |
+
echo "║ Processing Dataset: ${DATASET_NAME}"
|
| 183 |
+
echo "╚════════════════════════════════════════════════╝"
|
| 184 |
+
echo ""
|
| 185 |
+
|
| 186 |
+
# ============================================
|
| 187 |
+
# STEP 1: Generation
|
| 188 |
+
# ============================================
|
| 189 |
+
echo "================================================"
|
| 190 |
+
echo "📝 STEP 1/2: Generating code with memory..."
|
| 191 |
+
echo "================================================"
|
| 192 |
+
echo "Dataset: ${DATASET_NAME}"
|
| 193 |
+
echo ""
|
| 194 |
+
|
| 195 |
+
local generate_args=(
|
| 196 |
+
--dataset "${DATASET_NAME}"
|
| 197 |
+
--model "${MODEL}"
|
| 198 |
+
--temperature "${TEMPERATURE}"
|
| 199 |
+
--max_problems "${MAX_PROBLEMS}"
|
| 200 |
+
--memory_dir "${MEMORY_DIR}"
|
| 201 |
+
--memory_top_k "${MEMORY_TOP_K}"
|
| 202 |
+
--parallel "${PARALLEL}"
|
| 203 |
+
--output "${OUTPUT_FILE}"
|
| 204 |
+
--max_retries "${MAX_RETRIES}"
|
| 205 |
+
        --execution_timeout 60
    )

    if [ -n "${EMBEDDING_MODEL}" ]; then
        generate_args+=(--embedding_model "${EMBEDDING_MODEL}")
    fi

    python "${GENERATE_CLI}" "${generate_args[@]}"

    EXIT_CODE=$?

    if [ ${EXIT_CODE} -ne 0 ]; then
        echo ""
        echo "❌ Generation failed for ${DATASET_NAME} with exit code ${EXIT_CODE}"
        return 1
    fi

    echo ""
    echo "✅ Generation completed for ${DATASET_NAME}!"
    echo ""

    # Show generation summary
    if [ -f "${OUTPUT_FILE}" ]; then
        TOTAL=$(wc -l < "${OUTPUT_FILE}")
        SUCCESS=$(grep -c '"status": "success"' "${OUTPUT_FILE}" 2>/dev/null || true)
        if [ -z "${SUCCESS}" ]; then
            SUCCESS=0
        fi
        echo "📊 Generation Summary:"
        echo "  Total problems: ${TOTAL}"
        echo "  Successful: ${SUCCESS}"

        if [ "${SUCCESS}" -eq 0 ]; then
            echo ""
            echo "❌ Generation produced zero successful solutions for ${DATASET_NAME}"
            echo "   Refusing to continue with an incomplete run."
            return 1
        fi
    fi

    echo ""

    # ============================================
    # STEP 2: Evaluation
    # ============================================
    echo "================================================"
    echo "🔍 STEP 2/2: Executing and evaluating..."
    echo "================================================"
    echo ""

    local execute_args=(
        --input_file "${OUTPUT_FILE}"
        --output_dir "${EVAL_FILE}"
        --num_workers "${NUM_WORKERS}"
        --timeout "${TIMEOUT}"
        --tolerance "${TOLERANCE}"
        --use_relative_tolerance
    )

    if [ -n "${EMBEDDING_MODEL}" ]; then
        execute_args+=(--embedding_model "${EMBEDDING_MODEL}")
    fi

    python "${EXECUTE_CLI}" "${execute_args[@]}"
    EXIT_CODE=$?

    if [ ${EXIT_CODE} -ne 0 ]; then
        echo ""
        echo "❌ Evaluation failed for ${DATASET_NAME} with exit code ${EXIT_CODE}"
        return 1
    fi

    echo ""
    echo "✅ Evaluation completed for ${DATASET_NAME}!"
    echo ""

    # Show evaluation report if exists
    if [ -f "${EVAL_REPORT}" ]; then
        echo "📊 Evaluation Results for ${DATASET_NAME}:"
        cat "${EVAL_REPORT}" | jq '{
            accuracy: .accuracy,
            correct: .correct,
            total: .total_problems,
            status_counts: .status_counts
        }' 2>/dev/null || cat "${EVAL_REPORT}"
        echo ""

        # Store results for final summary (with lock for parallel execution)
        ACCURACY=$(cat "${EVAL_REPORT}" | jq -r '.accuracy' 2>/dev/null || echo "N/A")
        CORRECT=$(cat "${EVAL_REPORT}" | jq -r '.correct' 2>/dev/null || echo "N/A")
        TOTAL_PROBS=$(cat "${EVAL_REPORT}" | jq -r '.total_problems' 2>/dev/null || echo "N/A")

        # Use lock to safely append to results file (fallback to simple append if flock not available)
        RESULTS_LOCK="${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.lock"
        if command -v flock >/dev/null 2>&1; then
            (
                flock -x 200
                echo "${DATASET_NAME}|${ACCURACY}|${CORRECT}|${TOTAL_PROBS}|${EVAL_FILE}" >> "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
            ) 200>"${RESULTS_LOCK}"
        else
            # Fallback: use simple append (may have race condition but unlikely with small writes)
            echo "${DATASET_NAME}|${ACCURACY}|${CORRECT}|${TOTAL_PROBS}|${EVAL_FILE}" >> "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
        fi
    fi

    echo "================================================"
    echo ""

    if [ -f "${EVAL_REPORT}" ]; then
        return 0
    else
        return 1
    fi
}
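
Note (editor's illustration, not part of the commit): the `( flock -x 200; echo ... ) 200>lockfile` subshell above serializes appends to the shared batch-results file when several dataset jobs run in parallel. A minimal Python sketch of the same exclusive-lock append pattern, assuming a POSIX system with the standard fcntl module; the function and path names are hypothetical:

import fcntl

def append_result_line(results_path: str, lock_path: str, line: str) -> None:
    # Hold an exclusive lock for the duration of the append, mirroring
    # the shell's `( flock -x 200; echo ... ) 200>lockfile` idiom.
    with open(lock_path, "w") as lock_file:
        fcntl.flock(lock_file, fcntl.LOCK_EX)
        try:
            with open(results_path, "a", encoding="utf-8") as fh:
                fh.write(line + "\n")
        finally:
            fcntl.flock(lock_file, fcntl.LOCK_UN)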

# ============================================
# Function: Run single dataset (internal, supports logging)
# ============================================
run_single_dataset_internal() {
    local DATASET_NAME=$1
    local LOG_FILE=$2
    local STREAM_OUTPUT=${3:-false}

    if [ "${STREAM_OUTPUT}" = "true" ]; then
        process_dataset "${DATASET_NAME}" |& tee "${LOG_FILE}"
        local EXIT_CODE=${PIPESTATUS[0]}
        return ${EXIT_CODE}
    else
        process_dataset "${DATASET_NAME}" > "${LOG_FILE}" 2>&1
        return $?
    fi
}

# ============================================
# Function: Run single dataset (wrapper for sequential execution)
# ============================================
run_single_dataset() {
    local DATASET_NAME=$1
    local STREAM_OUTPUT=${2:-false}
    local LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"

    run_single_dataset_internal "${DATASET_NAME}" "${LOG_FILE}" "${STREAM_OUTPUT}"
    local EXIT_CODE=$?

    # Display output only when we did not already stream it live
    if [ "${STREAM_OUTPUT}" != "true" ]; then
        cat "${LOG_FILE}"
    fi

    return ${EXIT_CODE}
}

# ============================================
# Main Execution
# ============================================

echo "================================================"
echo "🚀 Generate + Evaluate Pipeline"
echo "================================================"
echo "Model: ${MODEL}"
echo "Max problems: ${MAX_PROBLEMS}"
echo "Temperature: ${TEMPERATURE}"
echo "Memory dir: ${MEMORY_DIR}"
echo "Memory Top-K: ${MEMORY_TOP_K}"
if [ -n "${EMBEDDING_MODEL}" ]; then
    echo "Embedding: ${EMBEDDING_MODEL}"
else
    echo "Embedding: MemoryBank default"
fi
echo "Parallel: ${PARALLEL}"
echo "Refresh Memory: ${REFRESH_DEBUG_MEMORY}"
echo "Run All Benchmarks: ${RUN_ALL_BENCHMARKS}"
echo "HF Offline: ${USE_HF_OFFLINE}"
echo "Parallel Benchmarks: ${PARALLEL_BENCHMARKS}"
if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
    echo "Max Parallel Jobs: ${MAX_PARALLEL_JOBS}"
fi
echo ""
echo "Eval Workers: ${NUM_WORKERS}"
echo "Eval Timeout: ${TIMEOUT}s"
echo "Tolerance: ${TOLERANCE} (relative)"
echo ""
echo "Max retries: ${MAX_RETRIES}"
echo "================================================"
echo ""

# Activate environment
ensure_or_debate_env || exit 1

# Set Hugging Face offline mode if enabled
if [ "${USE_HF_OFFLINE}" = "true" ]; then
    echo "ℹ️ Hugging Face offline mode enabled (using local cache)"
    export HF_HUB_OFFLINE=1
    export TRANSFORMERS_OFFLINE=1
    export HF_DATASETS_OFFLINE=1
else
    echo "ℹ️ Hugging Face online mode (may download models if needed)"
fi
echo ""

# Backup and clear debug memory (only once at the beginning)
backup_debug_memory

# ============================================
# Run benchmarks
# ============================================
if [ "${RUN_ALL_BENCHMARKS}" = "true" ]; then
    if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
        echo "================================================"
        echo "🔄 Running ALL benchmarks in PARALLEL"
        echo "================================================"
    else
        echo "================================================"
        echo "🔄 Running ALL benchmarks SEQUENTIALLY"
        echo "================================================"
    fi
    echo ""

    # Define benchmark dataset names in specified order (without .jsonl extension)
    # Modify this array to change the execution order
    BENCHMARK_NAMES=(
        "NL4OPT"
        "EasyLP"
        "ComplexLP"
        "NLP4LP"
        "ComplexOR"
        "IndustryOR"
        "ReSocratic"
        "OPT-Principled"
    )

    # Count total benchmarks
    TOTAL_BENCHMARKS=${#BENCHMARK_NAMES[@]}
    FAILED=0
    SKIPPED=0

    echo "Total benchmarks to process: ${TOTAL_BENCHMARKS}"
    echo ""
    echo "Execution order:"
    for i in "${!BENCHMARK_NAMES[@]}"; do
        echo "  $((i+1)). ${BENCHMARK_NAMES[$i]}"
    done
    echo ""

    # Initialize batch results file
    echo "Dataset|Accuracy|Correct|Total|Output" > "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"

    # Create lock file for parallel execution
    RESULTS_LOCK="${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.lock"
    touch "${RESULTS_LOCK}"

    # Process benchmarks (parallel or sequential)
    if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
        # Parallel execution
        declare -a PIDS=()
        declare -a DATASET_NAMES=()
        CURRENT_JOBS=0

        for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
            BENCHMARK_FILE="${BENCHMARKS_DIR}/${DATASET_NAME}.jsonl"

            # Check if file exists
            if [ ! -f "${BENCHMARK_FILE}" ]; then
                echo "⚠️ File not found: ${BENCHMARK_FILE}"
                echo "   Skipping ${DATASET_NAME}..."
                SKIPPED=$((SKIPPED + 1))
                continue
            fi

            # Wait for available slot if max jobs reached
            while true; do
                # Count running jobs
                CURRENT_JOBS=0
                for PID in "${PIDS[@]}"; do
                    if kill -0 ${PID} 2>/dev/null; then
                        CURRENT_JOBS=$((CURRENT_JOBS + 1))
                    fi
                done

                # Break if we have available slots
                if [ ${CURRENT_JOBS} -lt ${MAX_PARALLEL_JOBS} ]; then
                    break
                fi

                # Wait a bit before checking again
                sleep 1
            done

            # Start job in background
            LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"
            echo "🚀 Starting ${DATASET_NAME} (log: ${LOG_FILE})"

            (
                run_single_dataset_internal "${DATASET_NAME}" "${LOG_FILE}"
                EXIT_CODE=$?
                if [ ${EXIT_CODE} -ne 0 ]; then
                    echo "[${DATASET_NAME}] ❌ Failed with exit code ${EXIT_CODE}" >> "${OUTPUT_DIR}/failures_${MAIN_TIMESTAMP}.txt"
                else
                    echo "[${DATASET_NAME}] ✅ Completed successfully" >> "${OUTPUT_DIR}/success_${MAIN_TIMESTAMP}.txt"
                fi
            ) &

            PID=$!
            PIDS+=(${PID})
            DATASET_NAMES+=("${DATASET_NAME}")
        done

        # Wait for all jobs to complete
        echo ""
        echo "⏳ Waiting for all jobs to complete..."
        echo ""

        for i in "${!PIDS[@]}"; do
            PID=${PIDS[$i]}
            DATASET_NAME=${DATASET_NAMES[$i]}
            wait ${PID}
            EXIT_CODE=$?
            if [ ${EXIT_CODE} -ne 0 ]; then
                FAILED=$((FAILED + 1))
                echo "⚠️ ${DATASET_NAME} failed with exit code ${EXIT_CODE}"
            fi
        done

        # Clean up lock file
        rm -f "${RESULTS_LOCK}"

        echo ""
        echo "================================================"
        echo "📋 Individual Job Logs:"
        echo "================================================"
        for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
            LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"
            if [ -f "${LOG_FILE}" ]; then
                echo ""
                echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
                echo " ${DATASET_NAME} - Log File: ${LOG_FILE}"
                echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
                tail -20 "${LOG_FILE}"
            fi
        done
        echo ""

    else
        # Sequential execution
        CURRENT=0
        for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
            CURRENT=$((CURRENT + 1))
            BENCHMARK_FILE="${BENCHMARKS_DIR}/${DATASET_NAME}.jsonl"

            echo ""
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
            echo " Progress: ${CURRENT}/${TOTAL_BENCHMARKS}"
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

            # Check if file exists
            if [ ! -f "${BENCHMARK_FILE}" ]; then
                echo "⚠️ File not found: ${BENCHMARK_FILE}"
                echo "   Skipping..."
                SKIPPED=$((SKIPPED + 1))
                continue
            fi

            run_single_dataset "${DATASET_NAME}" true

            if [ $? -ne 0 ]; then
                FAILED=$((FAILED + 1))
                echo "⚠️ Failed to process ${DATASET_NAME}, continuing..."
            fi

            echo ""
        done

        # Clean up lock file
        rm -f "${RESULTS_LOCK}"
    fi

    # ============================================
    # Final Summary for All Benchmarks
    # ============================================
    echo ""
    echo "================================================"
    echo "🎉 All Benchmarks Complete!"
    echo "================================================"
    echo ""
    echo "Summary:"
    echo "  Total benchmarks: ${TOTAL_BENCHMARKS}"
    echo "  Successful: $((TOTAL_BENCHMARKS - FAILED - SKIPPED))"
    echo "  Failed: ${FAILED}"
    echo "  Skipped: ${SKIPPED}"
    echo ""
    echo "📊 Detailed Results:"
    echo "================================================"

    # Display formatted results table
    if [ -f "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt" ]; then
        echo ""
        printf "%-35s | %-10s | %-10s | %-10s\n" "Dataset" "Accuracy" "Correct" "Total"
        echo "--------------------------------------------------------------------------------"
        tail -n +2 "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt" | while IFS='|' read -r dataset accuracy correct total output; do
            printf "%-35s | %-10s | %-10s | %-10s\n" "${dataset}" "${accuracy}" "${correct}" "${total}"
        done
        echo ""
        echo "📁 Full results saved to: ${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
    fi

    echo ""
    echo "================================================"

else
    # Run single dataset mode
    echo "================================================"
    echo "📝 Running single dataset: ${DEFAULT_DATASET}"
    echo "================================================"
    echo ""

    BENCHMARK_FILE="${BENCHMARKS_DIR}/${DEFAULT_DATASET}.jsonl"
    if [ ! -f "${BENCHMARK_FILE}" ]; then
        echo "❌ Dataset file not found: ${BENCHMARK_FILE}"
        exit 1
    fi

    run_single_dataset "${DEFAULT_DATASET}" true

    if [ $? -ne 0 ]; then
        echo ""
        echo "❌ Pipeline failed"
        exit 1
    fi

    echo ""
    echo "🎉 Pipeline Complete!"
fi

echo ""
echo "✨ All done! Check the results above."
echo ""
scripts/run_memory_debate.py
ADDED

@@ -0,0 +1,17 @@
#!/usr/bin/env python3
"""Wrapper for debate_memory.run_memory_debate."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.run_memory_debate import main


if __name__ == "__main__":
    main()
scripts/test_self_healing_full.sh
ADDED

@@ -0,0 +1,92 @@
#!/bin/bash

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
SRC_DIR="${PROJECT_ROOT}/src"
export PYTHONPATH="${SRC_DIR}:${PYTHONPATH:-}"
GENERATE_CLI="${PROJECT_ROOT}/scripts/generate_with_memory.py"

# Test self-healing mechanism with a small sample
# This will test the full pipeline with just 3 problems

echo "================================================"
echo "🧪 Testing Self-Healing Mechanism"
echo "================================================"
echo ""

# Activate conda environment
source ~/miniconda3/etc/profile.d/conda.sh
conda activate or-debate

# Test parameters
MODEL="deepseek-chat"
DATASET="IndustryOR"
MAX_PROBLEMS=3
OUTPUT_DIR="${PROJECT_ROOT}/test_output"
OUTPUT_FILE="${OUTPUT_DIR}/test_self_healing_$(date +%Y%m%d_%H%M%S).jsonl"
MEMORY_DIR="${PROJECT_ROOT}/memory_storage"
MAX_RETRIES=3

mkdir -p "${OUTPUT_DIR}"

echo "Configuration:"
echo "  Model: ${MODEL}"
echo "  Dataset: ${DATASET}"
echo "  Max problems: ${MAX_PROBLEMS}"
echo "  Max retries: ${MAX_RETRIES}"
echo "  Output: ${OUTPUT_FILE}"
echo ""

# Run generation with self-healing
set +e
python "${GENERATE_CLI}" \
    --dataset "${DATASET}" \
    --model "${MODEL}" \
    --max_problems "${MAX_PROBLEMS}" \
    --output "${OUTPUT_FILE}" \
    --memory_dir "${MEMORY_DIR}" \
    --memory_top_k 3 \
    --parallel 1 \
    --max_retries "${MAX_RETRIES}" \
    --execution_timeout 60
EXIT_CODE=$?
set -e


if [ ${EXIT_CODE} -ne 0 ]; then
    echo ""
    echo "❌ Test failed with exit code ${EXIT_CODE}"
    exit 1
fi

echo ""
echo "================================================"
echo "📊 Test Results"
echo "================================================"

if [ -f "${OUTPUT_FILE}" ]; then
    TOTAL=$(wc -l < "${OUTPUT_FILE}")
    echo "Total problems processed: ${TOTAL}"

    # Count successes (grep -c already prints 0 on no match, so don't append a second 0)
    SUCCESS=$(grep -c '"execution_status": "success"' "${OUTPUT_FILE}" 2>/dev/null || true)
    echo "Successful executions: ${SUCCESS:-0}"

    # Count with retries
    RETRIED=$(grep -c '"total_attempts": [2-9]' "${OUTPUT_FILE}" 2>/dev/null || true)
    echo "Problems that used retry: ${RETRIED:-0}"

    # Show sample result
    echo ""
    echo "Sample result (problem 1):"
    head -1 "${OUTPUT_FILE}" | python -m json.tool | grep -E '"id"|"execution_status"|"total_attempts"|"self_healing_enabled"'

    echo ""
    echo "✅ Test completed successfully!"
    echo "Full results saved to: ${OUTPUT_FILE}"
else
    echo "❌ Output file not found: ${OUTPUT_FILE}"
    exit 1
fi
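
Note (editor's illustration, not part of the commit): beyond the grep-based summary above, the per-problem JSONL can be inspected directly; the field names (`id`, `execution_status`, `total_attempts`) are the ones the test script itself greps for. A sketch, assuming a hypothetical output path:

import json
from pathlib import Path

# Hypothetical path; substitute the test_self_healing_*.jsonl file produced above.
output_file = Path("test_output/test_self_healing_20250101_000000.jsonl")

with output_file.open(encoding="utf-8") as fh:
    for line in fh:
        rec = json.loads(line)
        # One record per problem; total_attempts > 1 means the retry path ran.
        print(rec.get("id"), rec.get("execution_status"), rec.get("total_attempts"))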
src/debate_memory/__init__.py
ADDED

@@ -0,0 +1,11 @@
"""Debate-with-memory v2 core package."""

from importlib import metadata

try:
    __version__ = metadata.version("debate-memory")
except metadata.PackageNotFoundError:  # pragma: no cover - local usage
    __version__ = "0.0.0"

__all__ = ["__version__"]
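
Note (editor's illustration, not part of the commit): the try/except above keeps the package importable from a source checkout without installation; in that case `__version__` falls back to "0.0.0". A quick check, assuming src/ is on PYTHONPATH:

import debate_memory

# Prints "0.0.0" from the fallback branch unless a distribution named
# "debate-memory" has actually been installed (e.g. via pip).
print(debate_memory.__version__)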
src/debate_memory/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (404 Bytes).

src/debate_memory/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (541 Bytes).

src/debate_memory/__pycache__/build_memory_from_eval_results.cpython-311.pyc
ADDED
Binary file (14.4 kB).

src/debate_memory/__pycache__/config.cpython-310.pyc
ADDED
Binary file (4.82 kB).

src/debate_memory/__pycache__/config.cpython-311.pyc
ADDED
Binary file (6.6 kB).

src/debate_memory/__pycache__/debate_memory_builder.cpython-311.pyc
ADDED
Binary file (23.1 kB).

src/debate_memory/__pycache__/debug_executor.cpython-310.pyc
ADDED
Binary file (3.7 kB).

src/debate_memory/__pycache__/debug_memory.cpython-310.pyc
ADDED
Binary file (5.19 kB).

src/debate_memory/__pycache__/debug_memory_builder.cpython-311.pyc
ADDED
Binary file (8.81 kB).

src/debate_memory/__pycache__/generate_with_memory.cpython-310.pyc
ADDED
Binary file (24 kB).

src/debate_memory/__pycache__/generate_with_memory.cpython-311.pyc
ADDED
Binary file (40.9 kB).

src/debate_memory/__pycache__/llm.cpython-310.pyc
ADDED
Binary file (3.16 kB).

src/debate_memory/__pycache__/llm.cpython-311.pyc
ADDED
Binary file (4.98 kB).

src/debate_memory/__pycache__/memory_bank.cpython-310.pyc
ADDED
Binary file (9.19 kB).

src/debate_memory/__pycache__/memory_bank.cpython-311.pyc
ADDED
Binary file (15.3 kB).

src/debate_memory/__pycache__/run_memory_debate.cpython-311.pyc
ADDED
Binary file (27.6 kB).
src/debate_memory/augment_memory_from_standalone_runs.py
ADDED

@@ -0,0 +1,974 @@
#!/usr/bin/env python3
"""Build non-destructive memory variants from standalone pipeline runs."""

from __future__ import annotations

import argparse
import glob
import hashlib
import json
import shutil
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple

from llama_index.core import Document

from .memory_bank import MemoryBank

SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent.parent
DEFAULT_BASE_ROOT = PROJECT_ROOT
DEFAULT_VARIANTS_ROOT = PROJECT_ROOT / "memory_variants"
DEFAULT_STANDALONE_ROOT = Path("/home/datagen/OR-Debate/standalone_pipeline/runs")

MAIN_MEMORY_DIRNAME = "memory_storage"
DEBUG_CASE_MEMORY_DIRNAME = "debug_case_memory"
DEBATE_MEMORY_DIRNAME = "debate_memory_storage"
DEBUG_MEMORY_FILENAME = "debug_memory.jsonl"

DEBUG_FAILURE_STATUSES = {
    "execution_error",
    "error",
    "timeout",
    "no_code",
    "not_executed",
    "success_no_objective",
    "execution_failed",
}

PROMPT_ARTIFACT_HEADERS = (
    "\n# Retrieved Historical Cases",
    "\n# Debate Memory Insights",
    "\n# Retrieved Debug Guidance",
)


@dataclass
class RunArtifacts:
    source_root: Path
    run_dir: Path
    dataset: str
    model_a: str
    model_b: str
    single_generated: Dict[str, Path]
    debate_results: Optional[Path]
    consensus_jsonl: Optional[Path]
    consensus_eval: Optional[Path]
    manifest_path: Optional[Path]

    @property
    def has_complete_debate(self) -> bool:
        return bool(
            self.debate_results
            and self.consensus_jsonl
            and self.consensus_eval
            and self.debate_results.exists()
            and self.consensus_jsonl.exists()
            and self.consensus_eval.exists()
        )


@dataclass
class ReferenceSolution:
    source: str
    model: str
    code: str
    objective_value: Optional[float]
    chosen_model: Optional[str]


def now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()


def now_stamp() -> str:
    return datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")


def load_jsonl(path: Path) -> List[Dict]:
    rows: List[Dict] = []
    if not path or not path.exists():
        return rows
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return rows


def append_jsonl(path: Path, rows: Iterable[Dict]) -> int:
    count = 0
    with path.open("a", encoding="utf-8") as fh:
        for row in rows:
            fh.write(json.dumps(row, ensure_ascii=False) + "\n")
            count += 1
    return count


def load_json(path: Path) -> Dict:
    if not path.exists():
        return {}
    with path.open("r", encoding="utf-8") as fh:
        return json.load(fh)


def dump_json(path: Path, payload: Dict) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2, sort_keys=True)


def count_jsonl_lines(path: Path) -> int:
    if not path.exists():
        return 0
    with path.open("r", encoding="utf-8") as fh:
        return sum(1 for line in fh if line.strip())


def float_or_none(value) -> Optional[float]:
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def infer_models_from_run_name(run_name: str) -> Tuple[str, str]:
    parts = run_name.split("_vs_")
    if len(parts) != 2:
        return "modelA", "modelB"
    left = parts[0].split("_")
    if len(left) < 2:
        return left[-1], parts[1]
    return "_".join(left[1:]), parts[1]

def clean_description(text: str) -> str:
    cleaned = (text or "").strip()
    for header in PROMPT_ARTIFACT_HEADERS:
        pos = cleaned.find(header)
        if pos != -1:
            cleaned = cleaned[:pos].rstrip()
    return cleaned


def check_correctness(
    pred_obj: Optional[float],
    gt_obj: Optional[float],
    tolerance: float,
    use_relative_tolerance: bool,
) -> bool:
    if pred_obj is None or gt_obj is None:
        return False
    if gt_obj == 0:
        return abs(pred_obj) <= tolerance
    if use_relative_tolerance:
        return abs((pred_obj - gt_obj) / gt_obj) <= tolerance
    return abs(pred_obj - gt_obj) <= tolerance
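
Note (editor's illustration, not part of the commit): with relative tolerance enabled (as the evaluation step above does via --use_relative_tolerance), a prediction of 102.0 against a ground truth of 100.0 gives |102 - 100| / 100 = 0.02, which passes a 0.05 tolerance:

# Relative: |102 - 100| / 100 = 0.02 <= 0.05 -> True
print(check_correctness(102.0, 100.0, tolerance=0.05, use_relative_tolerance=True))

# Absolute: |102 - 100| = 2.0 > 0.05 -> False
print(check_correctness(102.0, 100.0, tolerance=0.05, use_relative_tolerance=False))

# Zero ground truth is special-cased with an absolute check on the prediction
print(check_correctness(0.01, 0.0, tolerance=0.05, use_relative_tolerance=True))  # True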

def sha1_short(text: str, length: int = 16) -> str:
    return hashlib.sha1(text.encode("utf-8")).hexdigest()[:length]


def build_doc(problem_id: int, description: str, solution_code: str, objective_value: float, metadata: Dict) -> Document:
    doc_text = f"""Problem: {description}

Solution approach:
{solution_code[:500]}...

Key features:
- Problem ID: {problem_id}
- Objective value: {objective_value}
- Status: Correct
"""
    return Document(
        text=doc_text,
        metadata={
            "problem_id": problem_id,
            "objective_value": objective_value,
            **metadata,
        },
    )


class BatchMemoryAppender:
    def __init__(self, memory_dir: Path, embedding_model: str) -> None:
        self.memory_dir = memory_dir
        self.bank = MemoryBank(memory_dir=str(memory_dir), embedding_model=embedding_model)
        self.pending_cases: List[Dict] = []
        self.pending_docs: List[Document] = []

    def add_case(
        self,
        *,
        problem_id: int,
        problem_desc: str,
        solution_code: str,
        objective_value: float,
        metadata: Dict,
    ) -> None:
        case = {
            "problem_id": int(problem_id),
            "description": problem_desc,
            "solution_code": solution_code,
            "objective_value": objective_value,
            "is_correct": True,
            "metadata": metadata,
        }
        self.pending_cases.append(case)
        self.pending_docs.append(
            build_doc(
                problem_id=int(problem_id),
                description=problem_desc,
                solution_code=solution_code,
                objective_value=objective_value,
                metadata=metadata,
            )
        )

    def finalize(self) -> int:
        if not self.pending_cases:
            return 0
        with Path(self.bank.cases_file).open("a", encoding="utf-8") as fh:
            for case in self.pending_cases:
                fh.write(json.dumps(case, ensure_ascii=False) + "\n")
        for doc in self.pending_docs:
            self.bank.index.insert(doc)
        self.bank.index.storage_context.persist(persist_dir=self.bank.index_dir)
        added = len(self.pending_cases)
        self.pending_cases.clear()
        self.pending_docs.clear()
        return added
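
Note (editor's illustration, not part of the commit): BatchMemoryAppender buffers both the cases.jsonl rows and the vector-index documents, then flushes them in a single finalize() call, so the relatively expensive index persist is amortized over all pending cases. A hedged usage sketch, assuming a hypothetical variant directory and the CLI's default embedding model:

# Hypothetical paths and content; a real run supplies these from parsed artifacts.
appender = BatchMemoryAppender(Path("memory_variants/demo/memory_storage"),
                               "BAAI/bge-small-en-v1.5")
appender.add_case(
    problem_id=1,
    problem_desc="Hypothetical LP problem",
    solution_code="print('OBJECTIVE_VALUE', 42.0)",
    objective_value=42.0,
    metadata={"source": "example"},
)
added = appender.finalize()  # appends to cases.jsonl and persists the index once
print(f"added {added} case(s)")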

def resolve_source_roots(patterns: Sequence[str]) -> List[Path]:
    resolved: List[Path] = []
    for pattern in patterns:
        matches = glob.glob(pattern)
        if matches:
            for match in matches:
                path = Path(match)
                if path.is_dir():
                    resolved.append(path.resolve())
        else:
            path = Path(pattern)
            if path.is_dir():
                resolved.append(path.resolve())
    deduped = sorted({path for path in resolved})
    return deduped


def resolve_file(run_dir: Path, raw_value: Optional[str]) -> Optional[Path]:
    if not raw_value:
        return None
    candidate = Path(raw_value)
    if not candidate.is_absolute():
        candidate = run_dir / candidate
    return candidate if candidate.exists() else None


def discover_run_artifacts(source_root: Path) -> List[RunArtifacts]:
    runs: List[RunArtifacts] = []
    if not source_root.exists():
        return runs

    for run_dir in sorted(source_root.iterdir()):
        if not run_dir.is_dir():
            continue

        manifest_path = run_dir / "run_manifest.json"
        manifest = load_json(manifest_path) if manifest_path.exists() else {}

        model_a, model_b = infer_models_from_run_name(run_dir.name)
        model_a = manifest.get("model_a", model_a)
        model_b = manifest.get("model_b", model_b)
        dataset = manifest.get("dataset", source_root.name)

        single_generated: Dict[str, Path] = {}
        for generated in sorted(run_dir.glob("single/*/generated.jsonl")):
            model_name = generated.parent.name
            single_generated[model_name] = generated

        model_a_generated = resolve_file(run_dir, manifest.get("model_a_generated"))
        model_b_generated = resolve_file(run_dir, manifest.get("model_b_generated"))
        if model_a_generated:
            single_generated.setdefault(model_a, model_a_generated)
        if model_b_generated:
            single_generated.setdefault(model_b, model_b_generated)

        debate_results = run_dir / "debate" / "debate_results.jsonl"
        if not debate_results.exists():
            debate_results = resolve_file(run_dir, manifest.get("debate_dir"))
            if debate_results and debate_results.is_dir():
                debate_results = debate_results / "debate_results.jsonl"
            if debate_results and not debate_results.exists():
                debate_results = None

        consensus_jsonl = resolve_file(run_dir, manifest.get("consensus_jsonl"))
        if consensus_jsonl is None:
            candidates = sorted((run_dir / "debate").glob("consensus_*.jsonl"))
            consensus_jsonl = candidates[0] if candidates else None

        consensus_eval = run_dir / "consensus_eval" / "evaluation_results.jsonl"
        if not consensus_eval.exists():
            consensus_eval = None

        runs.append(
            RunArtifacts(
                source_root=source_root,
                run_dir=run_dir,
                dataset=dataset,
                model_a=model_a,
                model_b=model_b,
                single_generated=single_generated,
                debate_results=debate_results,
                consensus_jsonl=consensus_jsonl,
                consensus_eval=consensus_eval,
                manifest_path=manifest_path if manifest_path.exists() else None,
            )
        )
    return runs


def load_existing_case_signatures(cases_file: Path) -> set[str]:
    signatures: set[str] = set()
    if not cases_file.exists():
        return signatures
    with cases_file.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            meta = row.get("metadata") or {}
            for key in ("import_signature", "debate_signature"):
                value = meta.get(key)
                if value:
                    signatures.add(str(value))
    return signatures


def load_existing_debug_signatures(debug_memory_file: Path) -> set[str]:
    signatures: set[str] = set()
    if not debug_memory_file.exists():
        return signatures
    with debug_memory_file.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            signature = row.get("signature")
            if signature:
                signatures.add(str(signature))
    return signatures


def summarize_rounds(rounds: List[Dict], max_chars: int = 1800) -> str:
    if not rounds:
        return ""
    lines: List[str] = []
    for rnd in rounds:
        lines.append(
            f"Round {rnd.get('round')}: "
            f"A={rnd.get('result_A')} ({rnd.get('status_A')}), "
            f"B={rnd.get('result_B')} ({rnd.get('status_B')})"
        )
        analysis_a = (rnd.get("analysis_A") or "").strip()
        analysis_b = (rnd.get("analysis_B") or "").strip()
        if analysis_a:
            lines.append(f"Model A analysis:\n{analysis_a}")
        if analysis_b:
            lines.append(f"Model B analysis:\n{analysis_b}")
        lines.append("")
    text = "\n".join(lines).strip()
    if len(text) <= max_chars:
        return text
    return text[: max_chars - 16] + "\n...\n(truncated)"


def heuristic_debate_summary(entry: Dict, model_a: str, model_b: str) -> Dict:
    initial_a = entry.get("initial_A_result")
    initial_b = entry.get("initial_B_result")
    final_result = entry.get("final_result")
    chosen_model = entry.get("chosen_model") or "consensus"
    rounds = entry.get("debate_rounds") or []
    summary = (
        f"Initial mismatch: {model_a}={initial_a}, {model_b}={initial_b}. "
        f"Debate converged in {len(rounds)} rounds and selected {chosen_model} "
        f"with final objective {final_result}."
    )
    decisive_argument = (
        f"The final candidate from {chosen_model} was retained after both sides "
        "aligned on the same executable outcome."
    )
    guardrails = [
        "Compare feasibility and objective values before rewriting the model.",
        "Keep a stable executable candidate whenever later edits do not improve the result.",
    ]
    return {
        "summary": summary,
        "mismatch_reason": "The two models initially disagreed on the objective value or feasibility.",
        "decisive_argument": decisive_argument,
        "guardrails": guardrails,
        "modeling_patterns": [],
        "history_excerpt": summarize_rounds(rounds),
    }
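
Note (editor's illustration, not part of the commit): heuristic_debate_summary builds a templated summary directly from the debate record's fields, without calling an LLM. For a minimal hypothetical debate entry:

entry = {
    "initial_A_result": 120.0,
    "initial_B_result": 95.0,
    "final_result": 95.0,
    "chosen_model": "modelB",
    "debate_rounds": [{"round": 1, "result_A": 95.0, "status_A": "success",
                       "result_B": 95.0, "status_B": "success"}],
}
payload = heuristic_debate_summary(entry, "modelA", "modelB")
print(payload["summary"])
# Initial mismatch: modelA=120.0, modelB=95.0. Debate converged in 1 rounds
# and selected modelB with final objective 95.0.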

def guidance_for_status(status: str) -> str:
    status = (status or "").strip()
    if status == "no_code":
        return "Return a complete executable Python program inside a ```python``` block."
    if status == "success_no_objective":
        return "Print the optimized objective explicitly, for example with OBJECTIVE_VALUE after optimize()."
    if status == "timeout":
        return "Reduce model-construction overhead and check whether loops or constraints are exploding combinatorially."
    if status == "not_executed":
        return "Make sure the generated response contains runnable code and that the execution step is actually triggered."
    return "Check imports, indexing, variable names, and model-object references against the traceback."


def has_disagreement(initial_a: Optional[float], initial_b: Optional[float], tolerance: float) -> bool:
    if initial_a is None or initial_b is None:
        return True
    return abs(initial_a - initial_b) > tolerance


def choose_error_text(row: Dict) -> str:
    stderr = (row.get("execution_stderr") or "").strip()
    stdout = (row.get("execution_stdout") or "").strip()
    status = (row.get("execution_status") or row.get("status") or "").strip()
    if stderr:
        return stderr
    if stdout:
        return stdout
    if status == "no_code":
        return "Generated code block is empty."
    if status == "not_executed":
        return "Execution did not complete and no detailed stderr/stdout was recorded."
    if status == "success_no_objective":
        return "Execution succeeded but no objective value could be extracted from stdout."
    return status or "Unknown execution issue."


def clone_base_memory_dirs(base_root: Path, variant_dir: Path) -> Dict[str, Path]:
    mapping = {}
    for dirname in (MAIN_MEMORY_DIRNAME, DEBUG_CASE_MEMORY_DIRNAME, DEBATE_MEMORY_DIRNAME):
        src = base_root / dirname
        dst = variant_dir / dirname
        shutil.copytree(src, dst)
        mapping[dirname] = dst
    return mapping


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Create augmented memory-bank variants from standalone pipeline runs without touching originals."
    )
    parser.add_argument(
        "--variant_name",
        type=str,
        required=True,
        help="Name of the output variant directory under memory_variants/",
    )
    parser.add_argument(
        "--source",
        nargs="+",
        required=True,
        help="Source directories or glob patterns under standalone_pipeline/runs.",
    )
    parser.add_argument(
        "--base_root",
        type=str,
        default=str(DEFAULT_BASE_ROOT),
        help="Project root that contains memory_storage/debug_case_memory/debate_memory_storage.",
    )
    parser.add_argument(
        "--variants_root",
        type=str,
        default=str(DEFAULT_VARIANTS_ROOT),
        help="Directory under which new variants are created.",
    )
    parser.add_argument(
        "--embedding_model",
        type=str,
        default="BAAI/bge-small-en-v1.5",
        help="Embedding model name or local path used when updating vector indexes.",
    )
    parser.add_argument(
        "--tolerance",
        type=float,
        default=0.05,
        help="Correctness tolerance for imported single-model cases.",
    )
    parser.add_argument(
        "--mismatch_tolerance",
        type=float,
        default=1e-3,
        help="Minimum difference between initial debate results to count as a disagreement.",
    )
    parser.add_argument(
        "--use_relative_tolerance",
        action="store_true",
        help="Use relative tolerance when judging single-model correctness.",
    )
    args = parser.parse_args()

    base_root = Path(args.base_root).resolve()
    variants_root = Path(args.variants_root).resolve()
    source_roots = resolve_source_roots(args.source)
    if not source_roots:
        raise FileNotFoundError(f"No source roots matched: {args.source}")

    variant_dir = variants_root / args.variant_name
    if variant_dir.exists():
        raise FileExistsError(f"Variant already exists: {variant_dir}")
    variant_dir.parent.mkdir(parents=True, exist_ok=True)

    print("=== Augment Standalone Memory Banks ===")
    print(f"Base root: {base_root}")
    print(f"Variant dir: {variant_dir}")
    print(f"Source roots: {len(source_roots)}")
    for root in source_roots:
        print(f"  - {root}")

    memory_dirs = clone_base_memory_dirs(base_root, variant_dir)

    main_memory_dir = memory_dirs[MAIN_MEMORY_DIRNAME]
    debug_case_memory_dir = memory_dirs[DEBUG_CASE_MEMORY_DIRNAME]
    debate_memory_dir = memory_dirs[DEBATE_MEMORY_DIRNAME]
    debug_memory_file = main_memory_dir / DEBUG_MEMORY_FILENAME

    main_seen = load_existing_case_signatures(main_memory_dir / "cases.jsonl")
    debug_case_seen = load_existing_case_signatures(debug_case_memory_dir / "cases.jsonl")
    debate_seen = load_existing_case_signatures(debate_memory_dir / "cases.jsonl")
    debug_raw_seen = load_existing_debug_signatures(debug_memory_file)

    main_appender = BatchMemoryAppender(main_memory_dir, args.embedding_model)
    debug_case_appender = BatchMemoryAppender(debug_case_memory_dir, args.embedding_model)
    debate_appender = BatchMemoryAppender(debate_memory_dir, args.embedding_model)
    pending_debug_rows: List[Dict] = []

    stats = {
        "runs": {
            "source_roots": len(source_roots),
            "runs_discovered": 0,
            "runs_with_manifest": 0,
            "runs_with_complete_debate": 0,
            "runs_partial_or_single_only": 0,
        },
        "memory_storage": {
            "single_correct_added": 0,
            "consensus_correct_added": 0,
            "duplicates_skipped": 0,
            "incorrect_or_missing_single_skipped": 0,
            "consensus_missing_code_or_eval_skipped": 0,
        },
        "debug_memory": {
            "raw_records_added": 0,
            "case_records_added": 0,
            "duplicates_skipped": 0,
            "non_failure_skipped": 0,
            "missing_reference_skipped": 0,
        },
        "debate_memory": {
            "added": 0,
            "duplicates_skipped": 0,
            "missing_or_incorrect_skipped": 0,
        },
    }

    all_runs: List[RunArtifacts] = []
    for source_root in source_roots:
        all_runs.extend(discover_run_artifacts(source_root))

    stats["runs"]["runs_discovered"] = len(all_runs)
    stats["runs"]["runs_with_manifest"] = sum(1 for run in all_runs if run.manifest_path)
    stats["runs"]["runs_with_complete_debate"] = sum(1 for run in all_runs if run.has_complete_debate)
    stats["runs"]["runs_partial_or_single_only"] = stats["runs"]["runs_discovered"] - stats["runs"]["runs_with_complete_debate"]

    for run in all_runs:
        print(f"Processing run: {run.run_dir}")

        single_rows_by_model: Dict[str, Dict[int, Dict]] = {}
        correct_single_refs: Dict[int, Dict[str, ReferenceSolution]] = {}

        for model_name, generated_path in sorted(run.single_generated.items()):
            rows_map: Dict[int, Dict] = {}
            for row in load_jsonl(generated_path):
                problem_id = row.get("id")
                if problem_id is None:
                    continue
                try:
                    problem_id = int(problem_id)
                except (TypeError, ValueError):
                    continue
                rows_map[problem_id] = row

                code = (row.get("generated_code") or "").strip()
                pred = float_or_none(row.get("execution_objective_value"))
                gt = float_or_none(row.get("answer"))
                is_correct = bool(code) and check_correctness(
                    pred,
                    gt,
                    tolerance=args.tolerance,
                    use_relative_tolerance=args.use_relative_tolerance,
                )
                if not is_correct:
                    stats["memory_storage"]["incorrect_or_missing_single_skipped"] += 1
                    continue

                description = clean_description(row.get("description", ""))
                signature_basis = (
                    f"main|single|{run.dataset}|{problem_id}|{model_name}|"
                    f"{sha1_short(code, 20)}|{pred}"
                )
                import_signature = f"standalone-main:{sha1_short(signature_basis, 20)}"
                if import_signature in main_seen:
                    stats["memory_storage"]["duplicates_skipped"] += 1
                    continue

                metadata = {
                    "source": "standalone_single_generated",
                    "dataset": run.dataset,
                    "run_dir": str(run.run_dir),
                    "run_name": run.run_dir.name,
                    "source_root": str(run.source_root),
                    "model": model_name,
                    "execution_status": row.get("execution_status", "unknown"),
                    "ground_truth": row.get("answer"),
                    "case_kind": "single",
                    "import_signature": import_signature,
                }
                main_appender.add_case(
                    problem_id=problem_id,
                    problem_desc=description,
                    solution_code=code,
                    objective_value=pred if pred is not None else 0.0,
                    metadata=metadata,
                )
                main_seen.add(import_signature)
                stats["memory_storage"]["single_correct_added"] += 1
                correct_single_refs.setdefault(problem_id, {})[model_name] = ReferenceSolution(
                    source="single",
                    model=model_name,
                    code=code,
                    objective_value=pred,
                    chosen_model=model_name,
                )

            single_rows_by_model[model_name] = rows_map

        consensus_rows_by_id: Dict[int, Dict] = {}
        debate_rows_by_id: Dict[int, Dict] = {}
        eval_rows_by_id: Dict[int, Dict] = {}
        consensus_refs: Dict[int, ReferenceSolution] = {}

        if run.has_complete_debate:
            for row in load_jsonl(run.consensus_jsonl):
                problem_id = row.get("id")
                if problem_id is None:
                    continue
                try:
                    consensus_rows_by_id[int(problem_id)] = row
                except (TypeError, ValueError):
                    continue
            for row in load_jsonl(run.debate_results):
                problem_id = row.get("problem_id")
                if problem_id is None:
                    continue
                try:
                    debate_rows_by_id[int(problem_id)] = row
                except (TypeError, ValueError):
                    continue
            for row in load_jsonl(run.consensus_eval):
                problem_id = row.get("id")
                if problem_id is None:
                    continue
                try:
                    eval_rows_by_id[int(problem_id)] = row
                except (TypeError, ValueError):
                    continue

        for problem_id, eval_row in eval_rows_by_id.items():
            if not eval_row.get("is_correct", False):
                stats["memory_storage"]["consensus_missing_code_or_eval_skipped"] += 1
                continue

            consensus_row = consensus_rows_by_id.get(problem_id, {})
            debate_row = debate_rows_by_id.get(problem_id, {})
            code = (consensus_row.get("generated_code") or debate_row.get("final_code") or "").strip()
            if not code:
                stats["memory_storage"]["consensus_missing_code_or_eval_skipped"] += 1
                continue

            description = clean_description(
                consensus_row.get("description")
                or next(
                    (
                        model_rows[problem_id].get("description")
                        for model_rows in single_rows_by_model.values()
                        if problem_id in model_rows
                    ),
                    f"{run.dataset} problem {problem_id}",
                )
            )
            pred = float_or_none(eval_row.get("predicted_objective"))
            signature_basis = (
                f"main|consensus|{run.dataset}|{problem_id}|"
                f"{sha1_short(code, 20)}|{pred}"
            )
            import_signature = f"standalone-main:{sha1_short(signature_basis, 20)}"
            if import_signature in main_seen:
                stats["memory_storage"]["duplicates_skipped"] += 1
            else:
                metadata = {
                    "source": "standalone_consensus_eval",
                    "dataset": run.dataset,
                    "run_dir": str(run.run_dir),
                    "run_name": run.run_dir.name,
                    "source_root": str(run.source_root),
                    "modelA": run.model_a,
                    "modelB": run.model_b,
                    "chosen_model": debate_row.get("chosen_model") or consensus_row.get("chosen_model"),
                    "execution_status": eval_row.get("execution_status", "unknown"),
                    "ground_truth": eval_row.get("ground_truth"),
                    "case_kind": "consensus",
                    "import_signature": import_signature,
                }
                main_appender.add_case(
                    problem_id=problem_id,
                    problem_desc=description,
                    solution_code=code,
                    objective_value=pred if pred is not None else 0.0,
                    metadata=metadata,
                )
                main_seen.add(import_signature)
                stats["memory_storage"]["consensus_correct_added"] += 1

            consensus_refs[problem_id] = ReferenceSolution(
                source="consensus",
                model="debate_consensus",
                code=code,
                objective_value=pred,
                chosen_model=debate_row.get("chosen_model") or consensus_row.get("chosen_model"),
            )

        for problem_id, debate_row in debate_rows_by_id.items():
            eval_row = eval_rows_by_id.get(problem_id)
            if not eval_row or not eval_row.get("is_correct", False):
                stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
                continue
            if not debate_row.get("converged"):
                stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
                continue
            initial_a = float_or_none(debate_row.get("initial_A_result"))
            initial_b = float_or_none(debate_row.get("initial_B_result"))
            if not has_disagreement(initial_a, initial_b, args.mismatch_tolerance):
                stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
                continue

            final_code = (debate_row.get("final_code") or "").strip()
            if not final_code:
                stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
                continue

            base_desc = clean_description(
                consensus_rows_by_id.get(problem_id, {}).get("description")
                or next(
                    (
                        model_rows[problem_id].get("description")
                        for model_rows in single_rows_by_model.values()
                        if problem_id in model_rows
                    ),
                    f"{run.dataset} problem {problem_id}",
                )
            )
            summary_payload = heuristic_debate_summary(debate_row, run.model_a, run.model_b)
            full_desc = (
                f"{base_desc}\n\n# Debate Memory Summary\n"
                f"{summary_payload.get('summary', '').strip()}"
            ).strip()
            debate_signature = (
                f"standalone-debate:{run.dataset}:{problem_id}:{sha1_short(final_code, 20)}"
            )
            if debate_signature in debate_seen:
                stats["debate_memory"]["duplicates_skipped"] += 1
                continue

            metadata = {
                "source": "standalone_debate_memory_import",
                "dataset": run.dataset,
                "run_dir": str(run.run_dir),
                "run_name": run.run_dir.name,
                "source_root": str(run.source_root),
                "modelA": run.model_a,
                "modelB": run.model_b,
                "initial_A_result": initial_a,
                "initial_B_result": initial_b,
                "ground_truth": eval_row.get("ground_truth"),
                "debate_signature": debate_signature,
                "import_signature": debate_signature,
                "summary": summary_payload,
            }
            debate_appender.add_case(
                problem_id=problem_id,
                problem_desc=full_desc,
                solution_code=final_code,
                objective_value=float_or_none(debate_row.get("final_result")) or 0.0,
                metadata=metadata,
            )
            debate_seen.add(debate_signature)
            stats["debate_memory"]["added"] += 1

        for model_name, rows_map in sorted(single_rows_by_model.items()):
            for problem_id, row in rows_map.items():
                status = row.get("execution_status") or row.get("status") or ""
                if status not in DEBUG_FAILURE_STATUSES:
                    stats["debug_memory"]["non_failure_skipped"] += 1
                    continue

                reference: Optional[ReferenceSolution] = None
                for other_model, ref in sorted(correct_single_refs.get(problem_id, {}).items()):
                    if other_model != model_name:
                        reference = ref
                        break
                if reference is None:
                    reference = consensus_refs.get(problem_id)
                if reference is None:
                    stats["debug_memory"]["missing_reference_skipped"]
|
| 856 |
+
continue
|
| 857 |
+
|
| 858 |
+
description = clean_description(row.get("description", ""))
|
| 859 |
+
error_text = choose_error_text(row)
|
| 860 |
+
guidance = (
|
| 861 |
+
f"{guidance_for_status(status)} "
|
| 862 |
+
f"Reference fix source: {reference.source} ({reference.model}); "
|
| 863 |
+
f"target objective: {reference.objective_value}."
|
| 864 |
+
)
|
| 865 |
+
import_signature = (
|
| 866 |
+
f"standalone-debug:{sha1_short(f'{run.dataset}|{problem_id}|{model_name}|{status}|{error_text}|{sha1_short(reference.code, 16)}', 20)}"
|
| 867 |
+
)
|
| 868 |
+
if import_signature in debug_case_seen or import_signature in debug_raw_seen:
|
| 869 |
+
stats["debug_memory"]["duplicates_skipped"] += 1
|
| 870 |
+
continue
|
| 871 |
+
|
| 872 |
+
debug_record = {
|
| 873 |
+
"signature": import_signature,
|
| 874 |
+
"status": status,
|
| 875 |
+
"error_text": error_text,
|
| 876 |
+
"guidance": guidance,
|
| 877 |
+
"problem_id": problem_id,
|
| 878 |
+
"description": description,
|
| 879 |
+
"metadata": {
|
| 880 |
+
"source": "standalone_runs.synthetic_debug_case",
|
| 881 |
+
"dataset": run.dataset,
|
| 882 |
+
"run_dir": str(run.run_dir),
|
| 883 |
+
"run_name": run.run_dir.name,
|
| 884 |
+
"source_root": str(run.source_root),
|
| 885 |
+
"model": model_name,
|
| 886 |
+
"reference_source": reference.source,
|
| 887 |
+
"reference_model": reference.model,
|
| 888 |
+
"reference_objective": reference.objective_value,
|
| 889 |
+
"reference_chosen_model": reference.chosen_model,
|
| 890 |
+
},
|
| 891 |
+
"timestamp": now_iso(),
|
| 892 |
+
}
|
| 893 |
+
pending_debug_rows.append(debug_record)
|
| 894 |
+
debug_raw_seen.add(import_signature)
|
| 895 |
+
|
| 896 |
+
prompt_desc = (
|
| 897 |
+
f"{description}\n\n"
|
| 898 |
+
f"## Error Details\n```\n{error_text}\n```\n"
|
| 899 |
+
f"## Guidance\n{guidance}\n"
|
| 900 |
+
)
|
| 901 |
+
reference_code = reference.code.strip()
|
| 902 |
+
solution_code = (
|
| 903 |
+
"# Synthetic Debug Memory Case\n"
|
| 904 |
+
f"# Signature: {import_signature}\n"
|
| 905 |
+
f"# Status: {status}\n"
|
| 906 |
+
f"# Reference source: {reference.source} ({reference.model})\n\n"
|
| 907 |
+
f"{reference_code}"
|
| 908 |
+
)
|
| 909 |
+
metadata = {
|
| 910 |
+
"source": "standalone_runs.synthetic_debug_case",
|
| 911 |
+
"dataset": run.dataset,
|
| 912 |
+
"run_dir": str(run.run_dir),
|
| 913 |
+
"run_name": run.run_dir.name,
|
| 914 |
+
"source_root": str(run.source_root),
|
| 915 |
+
"model": model_name,
|
| 916 |
+
"status": status,
|
| 917 |
+
"signature": import_signature,
|
| 918 |
+
"reference_source": reference.source,
|
| 919 |
+
"reference_model": reference.model,
|
| 920 |
+
"reference_objective": reference.objective_value,
|
| 921 |
+
"reference_chosen_model": reference.chosen_model,
|
| 922 |
+
"import_signature": import_signature,
|
| 923 |
+
}
|
| 924 |
+
debug_case_appender.add_case(
|
| 925 |
+
problem_id=problem_id,
|
| 926 |
+
problem_desc=prompt_desc,
|
| 927 |
+
solution_code=solution_code,
|
| 928 |
+
objective_value=0.0,
|
| 929 |
+
metadata=metadata,
|
| 930 |
+
)
|
| 931 |
+
debug_case_seen.add(import_signature)
|
| 932 |
+
stats["debug_memory"]["raw_records_added"] += 1
|
| 933 |
+
stats["debug_memory"]["case_records_added"] += 1
|
| 934 |
+
|
| 935 |
+
append_jsonl(debug_memory_file, pending_debug_rows)
|
| 936 |
+
|
| 937 |
+
main_added = main_appender.finalize()
|
| 938 |
+
debug_case_added = debug_case_appender.finalize()
|
| 939 |
+
debate_added = debate_appender.finalize()
|
| 940 |
+
|
| 941 |
+
summary = {
|
| 942 |
+
"created_at": now_iso(),
|
| 943 |
+
"variant_dir": str(variant_dir),
|
| 944 |
+
"base_root": str(base_root),
|
| 945 |
+
"source_patterns": list(args.source),
|
| 946 |
+
"resolved_source_roots": [str(path) for path in source_roots],
|
| 947 |
+
"embedding_model": args.embedding_model,
|
| 948 |
+
"tolerance": args.tolerance,
|
| 949 |
+
"use_relative_tolerance": args.use_relative_tolerance,
|
| 950 |
+
"mismatch_tolerance": args.mismatch_tolerance,
|
| 951 |
+
"stats": stats,
|
| 952 |
+
"final_counts": {
|
| 953 |
+
"memory_storage_cases": count_jsonl_lines(main_memory_dir / "cases.jsonl"),
|
| 954 |
+
"debug_memory_records": count_jsonl_lines(debug_memory_file),
|
| 955 |
+
"debug_case_memory_cases": count_jsonl_lines(debug_case_memory_dir / "cases.jsonl"),
|
| 956 |
+
"debate_memory_cases": count_jsonl_lines(debate_memory_dir / "cases.jsonl"),
|
| 957 |
+
"main_added_persisted": main_added,
|
| 958 |
+
"debug_case_added_persisted": debug_case_added,
|
| 959 |
+
"debate_added_persisted": debate_added,
|
| 960 |
+
},
|
| 961 |
+
}
|
| 962 |
+
dump_json(variant_dir / "import_summary.json", summary)
|
| 963 |
+
|
| 964 |
+
print("=== Import Complete ===")
|
| 965 |
+
print(f"Variant: {variant_dir}")
|
| 966 |
+
print(f"Main memory added: {main_added}")
|
| 967 |
+
print(f"Debug raw added: {len(pending_debug_rows)}")
|
| 968 |
+
print(f"Debug case added: {debug_case_added}")
|
| 969 |
+
print(f"Debate memory added: {debate_added}")
|
| 970 |
+
print(f"Summary: {variant_dir / 'import_summary.json'}")
|
| 971 |
+
|
| 972 |
+
|
| 973 |
+
if __name__ == "__main__":
|
| 974 |
+
main()
|
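The import above is made idempotent by the `import_signature` values: each stored case hashes its dataset, problem id, code, and predicted objective, and a signature seen once is skipped on every later pass. A minimal sketch of that scheme, assuming `sha1_short` (defined earlier in this file, not shown here) is simply a truncated SHA-1 hex digest:

import hashlib

def sha1_short(text: str, length: int = 20) -> str:
    # Assumed behavior of the helper used above: truncated SHA-1 hex digest.
    return hashlib.sha1(text.encode("utf-8")).hexdigest()[:length]

main_seen: set = set()

def consensus_signature(dataset: str, problem_id: int, code: str, pred) -> str:
    # Mirrors the signature_basis construction in the import loop above.
    basis = f"main|consensus|{dataset}|{problem_id}|{sha1_short(code, 20)}|{pred}"
    return f"standalone-main:{sha1_short(basis, 20)}"

sig = consensus_signature("EasyLP", 42, "x = 1", 17.0)
print(sig in main_seen)   # False: the first import adds the case
main_seen.add(sig)
print(sig in main_seen)   # True: a re-run counts it as duplicates_skipped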
src/debate_memory/build_memory_from_eval_results.py
ADDED
@@ -0,0 +1,293 @@
#!/usr/bin/env python3
"""
Build solution memory from evaluation result directories.

Any evaluation directory can be used as input as long as it contains both
`evaluation_results.jsonl` and a `code/` directory. The script extracts problem
descriptions, executable code, and objective values from correct cases and
writes them into the solution-memory store.
"""

import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Optional

from .config import find_benchmark_path, get_benchmark_dirs, normalize_dataset_name
from .memory_bank import MemoryBank

SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent.parent
DEFAULT_BENCHMARKS_DIR = get_benchmark_dirs(PROJECT_ROOT)[0]


def load_evaluation_results(eval_file: str) -> Dict[int, Dict]:
    """Load evaluation results as `{id: {...}}`."""
    results = {}
    if not os.path.exists(eval_file):
        print(f"Warning: evaluation result file not found: {eval_file}")
        return results

    with open(eval_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data = json.loads(line)
                results[data['id']] = data
    return results


def load_benchmark_data(benchmark_file: str) -> Dict[int, Dict]:
    """Load benchmark data as `{id: {...}}`."""
    data = {}
    if not os.path.exists(benchmark_file):
        print(f"Warning: benchmark file not found: {benchmark_file}")
        return data

    with open(benchmark_file, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if line.strip():
                item = json.loads(line)
                # Prefer an explicit id field, otherwise fall back to the line index.
                problem_id = item.get('id', item.get('problem_id', idx))
                data[problem_id] = item
    return data


def load_solution_code(code_file: str) -> Optional[str]:
    """Load a solution code file."""
    if not os.path.exists(code_file):
        return None

    try:
        with open(code_file, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Warning: failed to read code file {code_file}: {e}")
        return None


def extract_dataset_name(eval_dir: str) -> Optional[str]:
    """
    Extract the dataset name from an evaluation directory name.

    Example:
        `deepseek-chat_EasyLP_clean_eval_20251024_120712.jsonl` -> `EasyLP`
    """
    dir_name = os.path.basename(eval_dir)
    # Remove the .jsonl suffix if present.
    if dir_name.endswith('.jsonl'):
        dir_name = dir_name[:-6]

    # Remove the model name and timestamp.
    parts = dir_name.split('_')
    # Locate the `eval` marker.
    try:
        eval_idx = parts.index('eval')
        # The dataset name should appear before `eval`, after the model name.
        dataset_parts = parts[:eval_idx]
        if len(dataset_parts) > 1:
            return normalize_dataset_name('_'.join(dataset_parts[1:]))
        else:
            return normalize_dataset_name(dataset_parts[0]) if dataset_parts else None
    except ValueError:
        # Fallback for names of the form model_dataset_timestamp.
        if len(parts) >= 3:
            return normalize_dataset_name('_'.join(parts[1:-1]))
        return None


def build_memory_from_eval_result(eval_result_dir: str, benchmarks_dir: str, memory_bank: MemoryBank):
    """
    Build memory from a single evaluation result directory.

    Args:
        eval_result_dir: Directory containing `evaluation_results.jsonl` and `code/`.
        benchmarks_dir: Benchmark dataset directory.
        memory_bank: MemoryBank instance.
    """
    eval_file = os.path.join(eval_result_dir, 'evaluation_results.jsonl')
    code_dir = os.path.join(eval_result_dir, 'code')

    if not os.path.exists(eval_file):
        print(f"Warning: skipping {eval_result_dir}: evaluation_results.jsonl not found")
        return 0, 0

    # Extract the dataset name.
    dataset_name = extract_dataset_name(eval_result_dir)
    if not dataset_name:
        print(f"Warning: skipping {eval_result_dir}: failed to extract dataset name")
        return 0, 0

    benchmark_file = os.path.join(benchmarks_dir, f"{dataset_name}.jsonl")
    if not os.path.exists(benchmark_file):
        try:
            benchmark_file = str(find_benchmark_path(PROJECT_ROOT, dataset_name))
        except FileNotFoundError:
            pass
    if not os.path.exists(benchmark_file):
        print(f"Warning: skipping {eval_result_dir}: benchmark file not found: {benchmark_file}")
        return 0, 0

    print(f"Processing dataset: {dataset_name}")
    print(f"  evaluation results: {eval_file}")
    print(f"  benchmark file: {benchmark_file}")
    print(f"  code directory: {code_dir}")

    # Load all required inputs.
    eval_results = load_evaluation_results(eval_file)
    benchmark_data = load_benchmark_data(benchmark_file)

    added_count = 0
    skipped_count = 0

    # Process each correct case.
    for problem_id, eval_result in eval_results.items():
        # Only keep correct cases.
        if not eval_result.get('is_correct', False):
            skipped_count += 1
            continue

        # Recover the problem description.
        if problem_id not in benchmark_data:
            print(f"  Warning: skipping ID {problem_id}: missing from benchmark file")
            skipped_count += 1
            continue

        benchmark_item = benchmark_data[problem_id]
        # Support both `description` and `en_question`.
        description = benchmark_item.get('description', '') or benchmark_item.get('en_question', '')

        if not description:
            print(f"  Warning: skipping ID {problem_id}: missing problem description")
            skipped_count += 1
            continue

        # Load the solution code.
        code_file = os.path.join(code_dir, f"problem_{problem_id}.py")
        solution_code = load_solution_code(code_file)

        if not solution_code:
            print(f"  Warning: skipping ID {problem_id}: code file missing or unreadable")
            skipped_count += 1
            continue

        # Recover the objective value.
        objective_value = eval_result.get('predicted_objective')
        if objective_value is None:
            # Fall back to the benchmark answer fields if needed.
            answer_str = benchmark_item.get('answer', '') or benchmark_item.get('en_answer', '')
            try:
                objective_value = float(answer_str)
            except (TypeError, ValueError):
                print(f"  Warning: skipping ID {problem_id}: objective value unavailable")
                skipped_count += 1
                continue

        # Build metadata for the stored case.
        ground_truth = benchmark_item.get('answer', '') or benchmark_item.get('en_answer', '')
        metadata = {
            'source': 'eval_results',
            'dataset': dataset_name,
            'eval_dir': os.path.basename(eval_result_dir),
            'execution_status': eval_result.get('execution_status', 'unknown'),
            'ground_truth': ground_truth,
        }

        # Do not deduplicate across datasets; the same problem_id may appear in multiple benchmarks.

        # Add the case to the memory bank.
        try:
            memory_bank.add_case(
                problem_id=problem_id,
                problem_desc=description,
                solution_code=solution_code,
                objective_value=float(objective_value),
                is_correct=True,
                metadata=metadata
            )
            added_count += 1
        except Exception as e:
            print(f"  Error: failed to add ID {problem_id}: {e}")
            skipped_count += 1

    print(f"  added cases: {added_count}")
    print(f"  skipped cases: {skipped_count}")
    print()

    return added_count, skipped_count


def main():
    import argparse

    parser = argparse.ArgumentParser(description="Build a memory bank from evaluation results")
    parser.add_argument('--eval_dirs', type=str, nargs='+', required=True,
                        help='Evaluation result directories containing evaluation_results.jsonl and code/')
    parser.add_argument('--benchmarks_dir', type=str,
                        default=str(DEFAULT_BENCHMARKS_DIR),
                        help='Benchmark dataset directory')
    parser.add_argument('--memory_dir', type=str,
                        default=str(PROJECT_ROOT / "memory_storage"),
                        help='Memory storage directory')
    parser.add_argument('--clear', action='store_true',
                        help='Clear the existing memory store before building')

    args = parser.parse_args()

    # Validate input directories.
    if not os.path.exists(args.benchmarks_dir):
        print(f"Error: benchmark directory does not exist: {args.benchmarks_dir}")
        sys.exit(1)

    # Clear the memory store if requested.
    if args.clear:
        if os.path.exists(args.memory_dir):
            import shutil
            print(f"Clearing existing memory store: {args.memory_dir}")
            shutil.rmtree(args.memory_dir)
        print()

    # Initialize the memory bank.
    print("="*70)
    print("Building Memory Bank from Evaluation Results")
    print("="*70)
    print()

    memory_bank = MemoryBank(memory_dir=args.memory_dir)
    print(f"Current memory size: {memory_bank.case_count} cases")
    print()

    # Process each evaluation directory.
    total_added = 0
    total_skipped = 0

    for eval_dir in args.eval_dirs:
        if not os.path.exists(eval_dir):
            print(f"Warning: skipping missing directory: {eval_dir}")
            continue

        added, skipped = build_memory_from_eval_result(
            eval_dir, args.benchmarks_dir, memory_bank
        )
        total_added += added
        total_skipped += skipped

    # Refresh the case count.
    memory_bank.case_count = memory_bank._count_cases()

    print("="*70)
    print("Memory Bank Build Complete")
    print("="*70)
    print(f"Total added: {total_added} cases")
    print(f"Total skipped: {total_skipped} cases")
    print(f"Final memory size: {memory_bank.case_count} cases")
    print()
    print(f"Memory location: {args.memory_dir}")
    print(f"  - cases.jsonl: {os.path.join(args.memory_dir, 'cases.jsonl')}")
    print(f"  - index/: {os.path.join(args.memory_dir, 'index')}")
    print("="*70)


if __name__ == "__main__":
    main()
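For reference, `extract_dataset_name` above splits the directory name on underscores and anchors on the literal `eval` token; a quick trace of the documented example, using only the stdlib (the trailing `normalize_dataset_name` call then maps the `_clean` suffix to the canonical name):

# Trace of extract_dataset_name's happy path on the documented example.
name = "deepseek-chat_EasyLP_clean_eval_20251024_120712"
parts = name.split("_")
dataset_parts = parts[:parts.index("eval")]  # ['deepseek-chat', 'EasyLP', 'clean']
print("_".join(dataset_parts[1:]))  # EasyLP_clean, normalized to EasyLP downstream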
src/debate_memory/config.py
ADDED
@@ -0,0 +1,189 @@
"""
Configuration file for simple RAG evaluation
Contains prompt templates and other settings
"""

from pathlib import Path

# ============================================
# Prompt Templates
# ============================================

# Default Gurobi prompt template
GUROBI_PROMPT = {
    "system": """You are a helpful Assistant with expertise in mathematical modeling and the Gurobi solver. When the User provides an OR question, you will analyze it, build a detailed mathematical model, and provide the Gurobi code to solve it.

Your response should follow these steps:
1. Carefully analyze the problem to identify decision variables, objective, and constraints.

2. Develop a complete mathematical model, explicitly defining:
   - Sets
   - Parameters
   - Decision Variables (and their types)
   - Objective Function
   - Constraints
3. Provide the corresponding Gurobi Python code to implement the model.

Implementation guardrails:
- Use `gurobipy` exclusively (avoid cvxpy/pulp/copty imports).
- When indexing tupledict variables across periods, introduce an explicit sentinel index (e.g., period 0) for initial conditions instead of accessing undefined keys like `x[-1]`.
- Define any Big-M constants explicitly using bounds derived from the data before they appear in constraints.
- Keep the model linear/integer; if a relationship seems non-linear, introduce auxiliary variables and linearization rather than exponentiation or log constraints.
- Always ensure every symbol referenced in constraints/objective (such as `M`, helper dictionaries, etc.) is declared in the code block.
""",
    "user": """Problem:
{question}

Provide a complete solution with mathematical model and Gurobi code.
"""
}

# ============================================
# Model Configuration
# ============================================

# Supported models and their default temperatures
MODEL_CONFIGS = {
    "gpt-4o": {"temperature": 0.01, "max_tokens": 8192},
    "gpt-4o-mini": {"temperature": 0.01, "max_tokens": 8192},
    "deepseek-chat": {"temperature": 0.01, "max_tokens": 8192},
    "gemini-2.0-flash-exp": {"temperature": 0.01, "max_tokens": 8192},
    "gemini-2.5-pro": {"temperature": 0.01, "max_tokens": 8192},
}

# ============================================
# Evaluation Configuration
# ============================================

EVAL_CONFIG = {
    # Execution settings
    "timeout": 60,  # seconds
    "max_retries": 3,

    # Answer comparison settings
    "tolerance": 0.05,  # 5% relative tolerance by default
    "use_relative_tolerance": True,
    "absolute_tolerance": 1e-3,  # for zero objective values

    # Output settings
    "save_code": True,
    "save_output": False,  # whether to save stdout/stderr
    "verbose": False,
}

# ============================================
# Dataset Configuration
# ============================================

# Supported datasets
DATASETS = [
    "ComplexLP",
    "EasyLP",
    "IndustryOR",
    "NL4OPT",
    "NLP4LP",
    "ReSocratic",
    "ComplexOR",
    "OPT-Principled",
]

DATASET_ALIASES = {
    "complexlp_clean": "ComplexLP",
    "easylp_clean": "EasyLP",
    "industryor_clean": "IndustryOR",
    "industryor_v2": "IndustryOR",
    "industryor_fixedv2": "IndustryOR",
    "industryor_fixedv2_clean": "IndustryOR",
    "nl4opt": "NL4OPT",
    "nl4opt_clean": "NL4OPT",
    "nlp4lp_clean": "NLP4LP",
    "complexor_clean": "ComplexOR",
    "resocratic_clean": "ReSocratic",
    "combined": "OPT-Principled",
    "combined_dataset": "OPT-Principled",
    "opt-principled_clean": "OPT-Principled",
}

# Dataset-specific settings (if needed)
DATASET_CONFIG = {
    "ComplexLP": {"tolerance": 0.05},
    "EasyLP": {"tolerance": 0.01},
    "IndustryOR": {"tolerance": 0.05},
    "OPT-Principled": {"tolerance": 0.05},
}

# ============================================
# Utility Functions
# ============================================

def get_prompt_template(template_name="default"):
    """Get prompt template by name"""
    templates = {
        "default": GUROBI_PROMPT,
    }
    return templates.get(template_name, GUROBI_PROMPT)


def get_model_config(model_name):
    """Get configuration for a specific model"""
    return MODEL_CONFIGS.get(model_name, {"temperature": 0.01, "max_tokens": 8192})


def get_dataset_config(dataset_name):
    """Get configuration for a specific dataset"""
    return DATASET_CONFIG.get(normalize_dataset_name(dataset_name), {"tolerance": 0.05})


def normalize_dataset_name(dataset_name: str) -> str:
    """Map historical dataset names to the canonical OPEN benchmark names."""
    if not dataset_name:
        return dataset_name

    name = dataset_name.strip()
    if name.endswith(".jsonl"):
        name = name[:-6]

    alias = DATASET_ALIASES.get(name.casefold())
    if alias:
        return alias

    for canonical_name in DATASETS:
        if canonical_name.casefold() == name.casefold():
            return canonical_name

    if name.endswith("_clean"):
        base_name = name[:-6]
        for canonical_name in DATASETS:
            if canonical_name.casefold() == base_name.casefold():
                return canonical_name

    return name


def get_benchmark_dirs(project_root: Path) -> list[Path]:
    """Return benchmark directories in priority order for the migrated OPEN layout."""
    return [
        project_root.parent.parent / "data" / "benchmarks",
        project_root / "clean_benchmarks",
        project_root.parent / "clean_benchmarks",
    ]


def find_benchmark_path(project_root: Path, dataset_name: str) -> Path:
    """Locate the benchmark file for a dataset, accepting legacy names as input."""
    normalized_name = normalize_dataset_name(dataset_name)
    candidate_names = [normalized_name]
    raw_name = dataset_name[:-6] if dataset_name.endswith(".jsonl") else dataset_name
    if raw_name not in candidate_names:
        candidate_names.append(raw_name)

    for directory in get_benchmark_dirs(project_root):
        for name in candidate_names:
            candidate = directory / f"{name}.jsonl"
            if candidate.exists():
                return candidate

    raise FileNotFoundError(
        f"Dataset '{dataset_name}' not found. Checked directories: "
        f"{[str(path) for path in get_benchmark_dirs(project_root)]}"
    )
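Since several historical runs used lowercase or `_clean`-suffixed names, the alias table above is what keeps them pointing at a single canonical benchmark. A short usage sketch of the normalization helpers (assuming the package is importable as `debate_memory`, per the module layout in this upload):

from debate_memory.config import get_dataset_config, normalize_dataset_name

print(normalize_dataset_name("easylp_clean"))      # EasyLP, via DATASET_ALIASES
print(normalize_dataset_name("IndustryOR.jsonl"))  # IndustryOR, suffix stripped
print(get_dataset_config("easylp_clean"))          # {'tolerance': 0.01}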
src/debate_memory/debate_memory_builder.py
ADDED
@@ -0,0 +1,477 @@
"""
Build a debate-specific memory bank from historical debate runs.

This scans existing debate result directories such as
`./results/Agora-Opt/debate/<dataset>/<timestamp>_<modelA>_vs_<modelB>/`,
identifies problems where the two single generations disagreed yet
the debate converged to a correct consensus, summarizes the key reconciliation
insights (optionally via an LLM), and stores the cases inside a dedicated
`MemoryBank` directory (default: ./debate_memory_storage).
"""

from __future__ import annotations

import argparse
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

from tqdm import tqdm

from .llm import get_response
from .memory_bank import MemoryBank

PKG_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = PKG_DIR.parent.parent
DEFAULT_RUNS_ROOT = PROJECT_ROOT.parent.parent / "results" / "Agora-Opt" / "debate"
DEFAULT_DEBATE_MEMORY_DIR = PROJECT_ROOT / "debate_memory_storage"


@dataclass
class DebateCaseInput:
    dataset: str
    problem_id: int
    description: str
    final_code: str
    final_result: Optional[float]
    debate_rounds: List[Dict]
    modelA: str
    modelB: str
    run_dir: Path
    ground_truth: Optional[str]
    initial_A_result: Optional[float]
    initial_B_result: Optional[float]
    evaluation: Dict
    metadata: Dict


def load_jsonl(path: Path) -> List[Dict]:
    if not path.exists():
        return []
    data: List[Dict] = []
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return data


def float_or_none(value) -> Optional[float]:
    if value is None:
        return None
    try:
        return float(value)
    except (ValueError, TypeError):
        return None


def has_disagreement(entry: Dict, tolerance: float) -> bool:
    a = float_or_none(entry.get("initial_A_result"))
    b = float_or_none(entry.get("initial_B_result"))
    if a is None or b is None:
        return True
    return abs(a - b) > tolerance


def summarize_rounds(rounds: List[Dict], max_chars: int = 2000) -> str:
    if not rounds:
        return ""
    lines: List[str] = []
    for rnd in rounds:
        round_idx = rnd.get("round")
        res_a = rnd.get("result_A")
        res_b = rnd.get("result_B")
        status_a = rnd.get("status_A")
        status_b = rnd.get("status_B")
        analysis_a = (rnd.get("analysis_A") or "").strip()
        analysis_b = (rnd.get("analysis_B") or "").strip()
        lines.append(
            f"Round {round_idx}: A={res_a} ({status_a}), B={res_b} ({status_b})"
        )
        if analysis_a:
            lines.append(f"Model A analysis:\n{analysis_a}")
        if analysis_b:
            lines.append(f"Model B analysis:\n{analysis_b}")
        lines.append("")
    text = "\n".join(lines).strip()
    if len(text) <= max_chars:
        return text
    return text[: max_chars - 200] + "\n...\n(truncated)"


def build_summary_payload(
    case: DebateCaseInput,
    llm_model: Optional[str],
    temperature: float,
    llm_attempts: int = 1,
) -> Dict:
    history_text = summarize_rounds(case.debate_rounds)
    default_summary = {
        "summary": (
            f"Initial mismatch: modelA={case.initial_A_result}, "
            f"modelB={case.initial_B_result}. "
            f"Debate converged in {len(case.debate_rounds)} rounds."
        ),
        "mismatch_reason": "",
        "decisive_argument": "",
        "guardrails": [],
        "modeling_patterns": [],
    }
    if not llm_model:
        return default_summary | {"history_excerpt": history_text}

    prompt = f"""
You are helping an optimisation-debate memory builder.

Problem description:
{case.description}

Initial disagreement:
- Model A result: {case.initial_A_result}
- Model B result: {case.initial_B_result}
- Ground truth (if known): {case.ground_truth}

Debate transcript:
{history_text}

Final consensus objective: {case.final_result}

Please return a JSON object with the following keys:
- "summary": 2-3 sentences explaining how the debate resolved the mismatch.
- "mismatch_reason": concise reason for the disagreement.
- "decisive_argument": specific insight that convinced both sides.
- "guardrails": list of actionable bullet points the next debater should follow.
- "modeling_patterns": list of reusable modeling tricks/structures that appeared.

JSON ONLY. No prose outside the JSON.
""".strip()

    attempts_remaining = max(1, llm_attempts)
    last_error: Optional[Exception] = None
    while attempts_remaining > 0:
        try:
            response = get_response(
                prompt,
                model=llm_model,
                temperature=temperature,
                maximum_retries=1,
            )
            payload = json.loads(response)
            payload["history_excerpt"] = history_text
            return payload
        except Exception as exc:  # noqa: BLE001
            last_error = exc
            attempts_remaining -= 1

    fallback = default_summary.copy()
    failure_reason = f"{last_error}" if last_error else "LLM call failed"
    fallback["summary"] += f" LLM summary failed: {failure_reason}"
    fallback["history_excerpt"] = history_text
    return fallback


def existing_signatures(memory_dir: Path) -> set[str]:
    cases_path = memory_dir / "cases.jsonl"
    if not cases_path.exists():
        return set()
    signs: set[str] = set()
    with cases_path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            meta = data.get("metadata") or {}
            sig = meta.get("debate_signature")
            if sig:
                signs.add(sig)
    return signs


class DebateMemoryBuilder:
    def __init__(
        self,
        runs_root: Path,
        memory_dir: Path,
        mismatch_tolerance: float,
        llm_model: Optional[str],
        temperature: float,
        llm_attempts: int,
        max_workers: int,
        datasets: Optional[Iterable[str]] = None,
        dry_run: bool = False,
    ) -> None:
        self.runs_root = runs_root
        self.memory_dir = memory_dir
        self.mismatch_tolerance = mismatch_tolerance
        self.llm_model = llm_model
        self.temperature = temperature
        self.llm_attempts = max(1, llm_attempts)
        self.max_workers = max_workers
        self.datasets_filter = {d.lower() for d in datasets} if datasets else None
        self.dry_run = dry_run

    def build(self) -> None:
        candidates = self._collect_candidates()
        if not candidates:
            print("No qualifying debate cases found.")
            return

        if not self.memory_dir.exists() and not self.dry_run:
            self.memory_dir.mkdir(parents=True, exist_ok=True)

        seen_sigs = existing_signatures(self.memory_dir)

        bank = None if self.dry_run else MemoryBank(memory_dir=str(self.memory_dir))

        added = 0
        skipped_duplicates = 0
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = {
                executor.submit(self._summarize_case, case): case
                for case in candidates
            }
            for future in tqdm(as_completed(futures), total=len(futures), desc="Summarizing"):
                case = futures[future]
                signature = f"{case.dataset}:{case.problem_id}:{case.run_dir.name}"
                if signature in seen_sigs:
                    skipped_duplicates += 1
                    continue
                summary_payload = future.result()
                description = (
                    f"{case.description.strip()}\n\n"
                    f"# Debate Memory Summary\n"
                    f"{summary_payload.get('summary', '').strip()}"
                ).strip()
                metadata = {
                    "source": "debate_memory_builder",
                    "dataset": case.dataset,
                    "run_dir": str(case.run_dir),
                    "modelA": case.modelA,
                    "modelB": case.modelB,
                    "initial_A_result": case.initial_A_result,
                    "initial_B_result": case.initial_B_result,
                    "ground_truth": case.ground_truth,
                    "debate_signature": signature,
                    "summary": summary_payload,
                }
                if self.dry_run:
                    added += 1
                    continue
                try:
                    bank.add_case(
                        problem_id=case.problem_id,
                        problem_desc=description,
                        solution_code=case.final_code,
                        objective_value=case.final_result or 0.0,
                        is_correct=True,
                        metadata=metadata,
                    )
                    added += 1
                    seen_sigs.add(signature)
                except Exception as exc:  # noqa: BLE001
                    print(f"Failed to add case {signature}: {exc}")

        print("===== Debate Memory Builder Summary =====")
        print(f"Runs root: {self.runs_root}")
        print(f"Output dir: {self.memory_dir}")
        print(f"Total candidates: {len(candidates)}")
        print(f"Added cases: {added}")
        print(f"Duplicates skipped: {skipped_duplicates}")
        if self.dry_run:
            print("Dry-run mode: no cases were written.")

    def _collect_candidates(self) -> List[DebateCaseInput]:
        candidates: List[DebateCaseInput] = []
        if not self.runs_root.exists():
            print(f"Runs root not found: {self.runs_root}")
            return candidates

        for dataset_dir in sorted(self.runs_root.iterdir()):
            if not dataset_dir.is_dir():
                continue
            dataset_name = dataset_dir.name
            if self.datasets_filter and dataset_name.lower() not in self.datasets_filter:
                continue
            for run_dir in sorted(dataset_dir.iterdir()):
                if not run_dir.is_dir():
                    continue
                dataset_candidates = self._parse_run(dataset_name, run_dir)
                candidates.extend(dataset_candidates)
        return candidates

    def _parse_run(self, dataset: str, run_dir: Path) -> List[DebateCaseInput]:
        results_path = run_dir / "debate_results.jsonl"
        if not results_path.exists():
            return []

        modelA, modelB = self._infer_models(run_dir.name)
        consensus_path = next(run_dir.glob("consensus_*_vs_*.jsonl"), None)
        consensus_records = load_jsonl(consensus_path) if consensus_path else []
        desc_map = {int(rec["id"]): rec for rec in consensus_records if "id" in rec}

        eval_path = run_dir / "eval_consensus" / "evaluation_results.jsonl"
        evaluation_map = {
            int(rec["id"]): rec for rec in load_jsonl(eval_path) if "id" in rec
        }

        run_candidates: List[DebateCaseInput] = []
        for entry in load_jsonl(results_path):
            problem_id = entry.get("problem_id")
            if problem_id is None:
                continue
            problem_id = int(problem_id)
            if not has_disagreement(entry, self.mismatch_tolerance):
                continue
            if not entry.get("converged"):
                continue
            evaluation = evaluation_map.get(problem_id)
            desc_entry = desc_map.get(problem_id)
            if desc_entry:
                description = desc_entry.get("description") or f"{dataset} problem {problem_id}"
            else:
                description = f"Dataset {dataset} problem {problem_id}"
            final_code = entry.get("final_code") or (
                desc_entry.get("generated_code", "") if desc_entry else ""
            )
            if not final_code:
                continue
            debate_rounds = entry.get("debate_rounds") or []
            if not debate_rounds:
                continue
            run_candidates.append(
                DebateCaseInput(
                    dataset=dataset,
                    problem_id=problem_id,
                    description=description,
                    final_code=final_code,
                    final_result=float_or_none(entry.get("final_result")),
                    debate_rounds=debate_rounds,
                    modelA=modelA,
                    modelB=modelB,
                    run_dir=run_dir,
                    ground_truth=entry.get("ground_truth"),
                    initial_A_result=float_or_none(entry.get("initial_A_result")),
                    initial_B_result=float_or_none(entry.get("initial_B_result")),
                    evaluation=evaluation or {},
                    metadata={
                        "run_dir": str(run_dir),
                        "dataset": dataset,
                    },
                )
            )
        return run_candidates

    @staticmethod
    def _infer_models(run_name: str) -> Tuple[str, str]:
        """
        Run folder format: <timestamp>_<modelA>_vs_<modelB>
        """
        parts = run_name.split("_vs_")
        if len(parts) != 2:
            return "modelA", "modelB"
        left = parts[0].split("_")  # timestamp + modelA pieces
        if len(left) < 2:
            return left[-1], parts[1]
        modelA = "_".join(left[1:])
        modelB = parts[1]
        return modelA, modelB

    def _summarize_case(self, case: DebateCaseInput) -> Dict:
        return build_summary_payload(
            case,
            llm_model=self.llm_model,
            temperature=self.temperature,
            llm_attempts=self.llm_attempts,
        )


def parse_args():
    parser = argparse.ArgumentParser(description="Build debate memory bank from historical runs.")
    parser.add_argument(
        "--runs_root",
        type=str,
        default=str(DEFAULT_RUNS_ROOT),
        help="Directory containing debate run artifacts.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=str(DEFAULT_DEBATE_MEMORY_DIR),
        help="Directory to store the debate memory bank.",
    )
    parser.add_argument(
        "--datasets",
        type=str,
        nargs="*",
        default=None,
        help="Optional dataset filters (case-insensitive).",
    )
    parser.add_argument(
        "--mismatch_tolerance",
        type=float,
        default=1e-3,
        help="Minimum absolute difference between initial results to consider a disagreement.",
    )
    parser.add_argument(
        "--llm_model",
        type=str,
        default=None,
        help="Optional model name for LLM-based summaries. If omitted, heuristic summaries are used.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.3,
        help="Temperature for LLM summaries.",
    )
    parser.add_argument(
        "--max_workers",
        type=int,
        default=4,
        help="Parallel workers for summary generation.",
    )
    parser.add_argument(
        "--llm_attempts",
        type=int,
        default=2,
        help="Number of LLM attempts per case before falling back to heuristics.",
    )
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Run the pipeline without writing to the memory bank.",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    builder = DebateMemoryBuilder(
        runs_root=Path(args.runs_root),
        memory_dir=Path(args.output_dir),
        mismatch_tolerance=args.mismatch_tolerance,
        llm_model=args.llm_model,
        temperature=args.temperature,
        llm_attempts=args.llm_attempts,
        max_workers=args.max_workers,
        datasets=args.datasets,
        dry_run=args.dry_run,
    )
    builder.build()


if __name__ == "__main__":
    main()
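The `_infer_models` parsing above leans entirely on the `<timestamp>_<modelA>_vs_<modelB>` folder convention: everything before the first underscore on the left half is treated as the timestamp, and the remainder is model A. An illustration on a hypothetical run-folder name (the timestamp format here is a made-up example):

# Hypothetical folder name following <timestamp>_<modelA>_vs_<modelB>.
run_name = "20251024T120712_gpt-4o_vs_deepseek-chat"
left, model_b = run_name.split("_vs_")
model_a = "_".join(left.split("_")[1:])  # drop the leading timestamp token
print(model_a, model_b)  # gpt-4o deepseek-chat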
src/debate_memory/debug_executor.py
ADDED
@@ -0,0 +1,136 @@
```python
# -*- coding: utf-8 -*-
"""Execute generated Python code and capture basic diagnostics."""

from __future__ import annotations

import os
import re
import subprocess
import sys
from dataclasses import dataclass
from typing import Optional


AUTO_OBJECTIVE_SNIPPET = """
# Auto-added snippet: attempt to print the objective value for downstream evaluation.
try:
    candidate = None
    for name in ("model", "m", "Model"):
        if name in globals():
            candidate = globals()[name]
            break
    if candidate is not None and hasattr(candidate, "objVal"):
        print(f"OBJECTIVE_VALUE: {candidate.objVal}")
except Exception:
    pass
""".strip()


@dataclass
class ExecutionResult:
    status: str
    stdout: str
    stderr: str
    objective_value: Optional[float]
    returncode: Optional[int]
    code_path: Optional[str]


def _ensure_directory(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def _append_objective_snippet(code: str) -> str:
    if "OBJECTIVE_VALUE" in code:
        return code if code.endswith("\n") else code + "\n"
    return f"{code.rstrip()}\n\n{AUTO_OBJECTIVE_SNIPPET}\n"


def _normalize_output(value: object) -> str:
    if value is None:
        return ""
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    return str(value)


def _extract_objective_value(output: str) -> Optional[float]:
    if not output:
        return None
    patterns = [
        r"OBJECTIVE_VALUE:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
        r"Optimal\s+[Oo]bjective[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
        r"Obj:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
        r"Objective\s+value:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
    ]
    for pattern in patterns:
        match = re.search(pattern, output, re.IGNORECASE)
        if not match:
            continue
        try:
            return float(match.group(1))
        except ValueError:
            continue
    return None


def execute_generated_code(
    code: str,
    problem_id: int,
    output_dir: str,
    timeout: int = 120,
) -> ExecutionResult:
    """Write code to disk, execute it, and capture the outcome."""
    code_dir = os.path.join(output_dir, "code")
    _ensure_directory(code_dir)

    code_with_snippet = _append_objective_snippet(code)
    code_file = os.path.join(code_dir, f"problem_{problem_id}.py")
    with open(code_file, "w", encoding="utf-8") as fh:
        fh.write(code_with_snippet)

    try:
        completed = subprocess.run(
            [sys.executable, os.path.basename(code_file)],
            cwd=code_dir,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        return ExecutionResult(
            status="timeout",
            stdout=_normalize_output(exc.stdout),
            stderr=f"Execution timeout after {timeout} seconds",
            objective_value=None,
            returncode=None,
            code_path=code_file,
        )
    except Exception as exc:  # pragma: no cover - defensive
        return ExecutionResult(
            status="error",
            stdout="",
            stderr=str(exc),
            objective_value=None,
            returncode=None,
            code_path=code_file,
        )

    stdout = _normalize_output(completed.stdout)
    stderr = _normalize_output(completed.stderr)
    returncode = completed.returncode

    status = "success" if returncode == 0 else "execution_error"
    objective_value = _extract_objective_value(stdout) if status == "success" else None

    return ExecutionResult(
        status=status,
        stdout=stdout,
        stderr=stderr,
        objective_value=objective_value,
        returncode=returncode,
        code_path=code_file,
    )


__all__ = ["ExecutionResult", "execute_generated_code"]
```
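A minimal usage sketch for the executor above, assuming the repository root is on `PYTHONPATH` so `src.debate_memory` imports as a package; the `runs/demo` path is illustrative:

```python
from src.debate_memory.debug_executor import execute_generated_code

# Hypothetical generated snippet; any script that prints
# "OBJECTIVE_VALUE: <number>" (or gets the snippet auto-appended) works.
code = "model = None\nprint('OBJECTIVE_VALUE: 42.0')\n"

result = execute_generated_code(code, problem_id=1, output_dir="runs/demo", timeout=30)
print(result.status)           # "success" when the script exits with code 0
print(result.objective_value)  # 42.0, parsed from stdout
```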
src/debate_memory/debug_memory.py
ADDED
@@ -0,0 +1,163 @@
```python
# -*- coding: utf-8 -*-
"""Lightweight persistence for debugging experiences."""

from __future__ import annotations

import hashlib
import json
import threading
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional


def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()


def _normalise_error(text: str) -> str:
    return (text or "").strip()


@dataclass
class DebugRecord:
    """Single debugging observation stored on disk."""

    signature: str
    status: str
    error_text: str
    guidance: str
    problem_id: Optional[int]
    description: str
    metadata: Dict[str, Any]
    timestamp: str

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


_PKG_DIR = Path(__file__).resolve().parent
_PROJECT_ROOT = _PKG_DIR.parent.parent


class DebugMemoryStore:
    """Append-only store keyed by error signature."""

    DEFAULT_PATH = _PROJECT_ROOT / "memory_storage" / "debug_memory.jsonl"

    def __init__(self, path: Optional[str] = None):
        self.path = Path(path) if path else self.DEFAULT_PATH
        self.path.parent.mkdir(parents=True, exist_ok=True)
        if not self.path.exists():
            self.path.touch()
        self._lock = threading.Lock()

    @staticmethod
    def _signature_from_error(error_text: str, status: str) -> str:
        basis = _normalise_error(error_text)
        if not basis:
            basis = status or "unknown"
        digest = hashlib.sha1(basis.encode("utf-8")).hexdigest()[:12]
        return digest

    def _append(self, record: DebugRecord) -> None:
        payload = json.dumps(record.to_dict(), ensure_ascii=False)
        with self._lock, self.path.open("a", encoding="utf-8") as fh:
            fh.write(payload + "\n")

    def record_execution_feedback(
        self,
        *,
        problem_id: Optional[int],
        description: str,
        status: str,
        error_text: str,
        guidance: str,
        source: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Persist execution feedback and return the signature used."""
        signature_core = self._signature_from_error(error_text, status)
        signature = f"exec:{signature_core}"
        record = DebugRecord(
            signature=signature,
            status=status or "unknown",
            error_text=_normalise_error(error_text) or status or "",
            guidance=(guidance or "").strip(),
            problem_id=problem_id,
            description=(description or "").strip(),
            metadata={
                "source": source,
                **(metadata or {}),
            },
            timestamp=_now_iso(),
        )
        self._append(record)
        return signature

    def record_validation_feedback(
        self,
        *,
        problem_id: Optional[int],
        issues: Iterable[str],
        metadata: Optional[Dict[str, Any]] = None,
        source: str = "validation",
    ) -> List[str]:
        """Persist validation feedback items and return the signatures used."""
        signatures: List[str] = []
        for issue in issues:
            if not issue:
                continue
            signature_core = self._signature_from_error(issue, "validation")
            signature = f"validation:{signature_core}"
            record = DebugRecord(
                signature=signature,
                status="validation",
                error_text=_normalise_error(issue),
                guidance="",
                problem_id=problem_id,
                description="",
                metadata={
                    "source": source,
                    **(metadata or {}),
                },
                timestamp=_now_iso(),
            )
            self._append(record)
            signatures.append(signature)
        return signatures

    def retrieve_for_problem(self, problem_id: int, limit: int = 3) -> List[DebugRecord]:
        """Return recent records for a given problem id (best-effort)."""
        if problem_id is None:
            return []
        matches: List[DebugRecord] = []
        with self.path.open("r", encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    payload = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if payload.get("problem_id") != problem_id:
                    continue
                matches.append(
                    DebugRecord(
                        signature=payload.get("signature", ""),
                        status=payload.get("status", ""),
                        error_text=payload.get("error_text", ""),
                        guidance=payload.get("guidance", ""),
                        problem_id=payload.get("problem_id"),
                        description=payload.get("description", ""),
                        metadata=payload.get("metadata", {}) or {},
                        timestamp=payload.get("timestamp", ""),
                    )
                )
        matches.sort(key=lambda item: item.timestamp, reverse=True)
        return matches[:limit] if limit else matches


__all__ = ["DebugMemoryStore", "DebugRecord"]
```
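A short sketch of the store's append/retrieve cycle (the path, error text, and guidance are made up for illustration):

```python
from src.debate_memory.debug_memory import DebugMemoryStore

store = DebugMemoryStore("runs/demo/debug_memory.jsonl")  # illustrative path

# Each failure is appended as one JSONL record keyed by a sha1-based signature.
sig = store.record_execution_feedback(
    problem_id=7,
    description="Production planning LP",
    status="execution_error",
    error_text="gurobipy.GurobiError: Unknown attribute 'objval'",
    guidance="Use model.objVal (capital V) after model.optimize().",
    source="manual_demo",
)
print(sig)  # e.g. "exec:<12-hex-char digest>"

# Retrieval is a linear scan over the file, returning the newest records first.
for record in store.retrieve_for_problem(7, limit=3):
    print(record.status, record.guidance)
```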
src/debate_memory/debug_memory_builder.py
ADDED
@@ -0,0 +1,150 @@
```python
"""Convert debug_memory.jsonl records into a searchable MemoryBank."""

from __future__ import annotations

import argparse
import glob
import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

from .memory_bank import MemoryBank

PROJECT_ROOT = Path(__file__).resolve().parents[2]
LEGACY_ROOT = PROJECT_ROOT.parent / "debate_with_memory"


def _default_inputs() -> List[str]:
    candidates = [
        PROJECT_ROOT / "memory_storage" / "debug_memory.jsonl",
        LEGACY_ROOT / "memory_storage" / "debug_memory.jsonl",
        PROJECT_ROOT / "memory_storage" / "backups" / "*" / "debug_memory.jsonl",
        LEGACY_ROOT / "memory_storage" / "backups" / "*" / "debug_memory.jsonl",
    ]
    return [str(path) for path in candidates]


def _stable_id(signature: str) -> int:
    digest = hashlib.sha1(signature.encode("utf-8")).hexdigest()
    return int(digest[:12], 16)


def _parse_timestamp(ts: Optional[str]) -> datetime:
    if not ts:
        return datetime.min
    try:
        parsed = datetime.fromisoformat(ts)
    except ValueError:
        return datetime.min
    if parsed.tzinfo is not None:
        # Normalise to naive UTC so records with and without timezone info
        # stay mutually comparable (aware vs naive comparisons raise TypeError).
        parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
    return parsed


def load_debug_records(input_globs: List[str]) -> Dict[str, Dict]:
    records: Dict[str, Dict] = {}
    files: List[str] = []
    for pattern in input_globs:
        files.extend(glob.glob(pattern))
    paths = sorted({Path(f) for f in files if Path(f).exists()})
    for file_path in paths:
        with file_path.open("r", encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                signature = record.get("signature")
                if not signature:
                    continue
                ts = _parse_timestamp(record.get("timestamp"))
                existing = records.get(signature)
                if existing is None or ts > existing.get("_ts", datetime.min):
                    record["_ts"] = ts
                    records[signature] = record
    return records


def build_debug_memory(records: Dict[str, Dict], output_dir: Path, clear: bool) -> None:
    if clear and output_dir.exists():
        for child in output_dir.iterdir():
            if child.is_file():
                child.unlink()
            else:
                import shutil

                shutil.rmtree(child)
    bank = MemoryBank(memory_dir=str(output_dir))
    added = 0
    for signature, record in records.items():
        description = record.get("description", "Unknown problem")
        error_text = record.get("error_text", "")
        guidance = record.get("guidance", "")
        status = record.get("status", "")
        metadata = {
            "signature": signature,
            "status": status,
            "timestamp": record.get("timestamp"),
            **(record.get("metadata") or {}),
        }
        note_lines = ["# Debug Memory Case", f"Signature: {signature}", f"Status: {status}"]
        if guidance:
            note_lines.append(f"Guidance: {guidance}")
        note_lines.append("---")
        if error_text:
            note_lines.append("Error snippet:\n" + error_text)
        note_lines.append("---")
        note_lines.append(f"Source metadata: {metadata}")
        prompt_desc = (
            f"{description}\n\n## Error Details\n```\n{error_text}\n```\n"
            f"## Guidance\n{guidance or 'N/A'}\n"
        )
        problem_id = record.get("problem_id")
        if problem_id is None:
            problem_id = _stable_id(signature)
        try:
            bank.add_case(
                problem_id=int(problem_id),
                problem_desc=prompt_desc,
                solution_code="\n".join(note_lines),
                objective_value=0.0,
                is_correct=True,
                metadata=metadata,
            )
            added += 1
        except Exception as exc:  # noqa: BLE001
            print(f"Failed to add debug case {signature}: {exc}")
    print(f"✅ Added {added} debug cases to {output_dir}")


def parse_args():
    parser = argparse.ArgumentParser(description="Build debug memory bank from debug_memory.jsonl records")
    parser.add_argument(
        "--input", nargs="*", default=_default_inputs(), help="Input files/globs containing debug records",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=str(PROJECT_ROOT / "debug_case_memory"),
        help="Where to store the constructed memory bank",
    )
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Remove existing output_dir contents before rebuilding",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    records = load_debug_records(args.input)
    print(f"Loaded {len(records)} unique debug signatures")
    build_debug_memory(records, Path(args.output_dir), clear=args.clear)


if __name__ == "__main__":
    main()
```
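The builder deduplicates by signature, keeping the newest record, and falls back to a signature-derived id when a record has no `problem_id`. A sketch of those two behaviours, with an illustrative input path and assuming `MemoryBank`'s dependencies are available:

```python
from src.debate_memory.debug_memory_builder import load_debug_records, _stable_id

# load_debug_records expands the globs, parses each JSONL line, and keeps the
# record with the latest timestamp for every signature.
records = load_debug_records(["runs/demo/debug_memory.jsonl"])  # illustrative path
print(len(records), "unique signatures")

# Records without a problem_id get a deterministic integer derived from the
# signature, so rebuilding the bank yields stable case ids across runs.
print(_stable_id("exec:deadbeef0123"))
```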
src/debate_memory/debug_utils.py
ADDED
@@ -0,0 +1,99 @@
```python
# -*- coding: utf-8 -*-
"""Minimal helpers for generated code execution reports."""

from __future__ import annotations

import json
import os
from dataclasses import dataclass, asdict
from typing import List, Optional

from .debug_memory import DebugMemoryStore


@dataclass
class DebugMetadata:
    problem_id: int
    notes: List[str]

    def to_json(self) -> str:
        return json.dumps(asdict(self), ensure_ascii=False, indent=2)


def sanitize_code(code: str, problem_id: int):
    """Ensure code ends with a newline and capture any lightweight notes."""
    metadata = DebugMetadata(problem_id=problem_id, notes=[])
    cleaned = (code or "").rstrip() + "\n" if code else ""
    return cleaned, metadata


def save_debug_metadata(metadata: DebugMetadata, output_dir: str) -> None:
    """Persist metadata only when there is something noteworthy."""
    if not metadata.notes:
        return
    debug_dir = os.path.join(output_dir, "debug")
    os.makedirs(debug_dir, exist_ok=True)
    path = os.path.join(debug_dir, f"problem_{metadata.problem_id}.json")
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(metadata.to_json())


def write_debug_suggestions(
    problem_id: int,
    description: str,
    error_message: str,
    memory_helper,
    memory_bank,
    output_dir: str,
    *,
    status: str,
    debug_store: Optional[DebugMemoryStore] = None,
    top_k_cases: int = 3,
) -> None:
    """Write a straightforward debug report and optionally record the memory."""
    _ = memory_helper, memory_bank, top_k_cases  # Unused but kept for interface compatibility.
    debug_dir = os.path.join(output_dir, "debug")
    os.makedirs(debug_dir, exist_ok=True)
    path = os.path.join(debug_dir, f"problem_{problem_id}_suggestions.md")

    lines: List[str] = [
        f"# Debug Report for Problem {problem_id}",
        "",
        f"- **Status:** {status}",
    ]
    if description:
        lines.extend(["", "## Description", description.strip(), ""])
    if error_message:
        lines.extend(
            [
                "## Error Traceback",
                "```",
                error_message.strip(),
                "```",
                "",
            ]
        )
    else:
        lines.extend(["", "## Error Traceback", "_No traceback captured._", ""])

    lines.append("## Notes")
    lines.append("")
    lines.append("Automated debugging is not yet implemented. Review the trace above for hints.")
    lines.append("")

    with open(path, "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines))

    if debug_store:
        debug_store.record_execution_feedback(
            problem_id=problem_id,
            description=description,
            status=status,
            error_text=error_message or status,
            guidance="Automated debugging is not yet implemented.",
            source="debug_utils.write_debug_suggestions",
            metadata={},
        )


__all__ = ["DebugMetadata", "sanitize_code", "save_debug_metadata", "write_debug_suggestions"]
```
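A usage sketch for `write_debug_suggestions`; `memory_helper` and `memory_bank` are accepted only for interface compatibility, so passing `None` is safe here (paths and the error message are illustrative):

```python
from src.debate_memory.debug_utils import write_debug_suggestions
from src.debate_memory.debug_memory import DebugMemoryStore

store = DebugMemoryStore("runs/demo/debug_memory.jsonl")  # illustrative path

# Writes runs/demo/debug/problem_3_suggestions.md and appends one
# execution-feedback record to the store.
write_debug_suggestions(
    problem_id=3,
    description="Blending problem",
    error_message="NameError: name 'quicksum' is not defined",
    memory_helper=None,
    memory_bank=None,
    output_dir="runs/demo",
    status="execution_error",
    debug_store=store,
)
```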
src/debate_memory/execute.py
ADDED
@@ -0,0 +1,522 @@
```python
"""
Execute and evaluate generated Gurobi code
"""

import argparse
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

from .debug_utils import sanitize_code, save_debug_metadata, write_debug_suggestions

SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent.parent
DEFAULT_MEMORY_DIR = PROJECT_ROOT / "memory_storage"
DEFAULT_GUIDELINES = DEFAULT_MEMORY_DIR / "category_guidelines.jsonl"
DEFAULT_DEBUG_MEMORY = DEFAULT_MEMORY_DIR / "debug_memory.jsonl"


def extract_objective_value(output: str) -> Optional[float]:
    """
    Extract objective value from Gurobi output

    Args:
        output: stdout from Gurobi code execution

    Returns:
        Objective value as float, or None if not found
    """
    if not output or output.strip() == "":
        return None

    # Common patterns in Gurobi output
    patterns = [
        r'Optimal\s+[Oo]bjective[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Obj:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Best\s+objective\s+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Objective\s+value:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'OBJECTIVE_VALUE:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',  # Our auto-added pattern
    ]

    for pattern in patterns:
        match = re.search(pattern, output, re.IGNORECASE)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                continue

    # Fallback: check common custom labels printed by prompts
    fallback_patterns = [
        r'Total\s+Cost[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Total\s+Profit[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Total\s+Net\s+Profit[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Total\s+Revenue[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
    ]

    for pattern in fallback_patterns:
        match = re.search(pattern, output, re.IGNORECASE)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                continue

    return None


def enhance_code_with_objective_print(code: str) -> str:
    """
    Add objective value printing to code if not already present

    This helps ensure we can extract the objective value even if
    the generated code doesn't print it explicitly.

    Note: Always adds a fallback print to handle cases where existing
    prints are conditional (e.g., inside if status == OPTIMAL blocks)
    """
    # Add code to print objective value (always add as a safety measure)
    enhancement_code = """
# Auto-added: Print objective value for evaluation (fallback)
try:
    # Try common variable names for Gurobi model
    if 'model' in dir():
        mdl = model
    elif 'm' in dir():
        mdl = m
    elif 'Model' in dir():
        mdl = Model
    else:
        mdl = None

    # Fallback: scan globals for a likely Gurobi model instance.
    # This helps when the generated code uses a non-standard variable name.
    if mdl is None:
        try:
            for _name, _val in list(globals().items()):
                try:
                    if hasattr(_val, 'objVal') and hasattr(_val, 'optimize'):
                        mdl = _val
                        break
                except Exception:
                    continue
        except Exception:
            pass

    if mdl is not None and hasattr(mdl, 'objVal'):
        try:
            obj_value = mdl.objVal
            print(f"OBJECTIVE_VALUE: {obj_value}")
        except Exception:
            # Model might not have been solved yet
            pass
except Exception:
    pass
"""

    return code + "\n" + enhancement_code


def execute_code(code: str, problem_id: int, output_dir: str, timeout: int = 60) -> Dict:
    """
    Execute Gurobi code and capture results

    Args:
        code: Python code to execute
        problem_id: Problem ID
        output_dir: Directory to save code files
        timeout: Execution timeout in seconds

    Returns:
        Dictionary with execution results
    """
    # Create output directory
    code_dir = os.path.join(output_dir, 'code')
    os.makedirs(code_dir, exist_ok=True)

    sanitized_code, debug_meta = sanitize_code(code, problem_id)
    code_enhanced = enhance_code_with_objective_print(sanitized_code)

    # Save code to file
    code_file = os.path.join(code_dir, f'problem_{problem_id}.py')
    with open(code_file, 'w', encoding='utf-8') as f:
        f.write(code_enhanced)

    # Persist debug metadata if anything noteworthy was detected
    save_debug_metadata(debug_meta, output_dir)

    # Execute code
    try:
        result = subprocess.run(
            [sys.executable, f'problem_{problem_id}.py'],
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=code_dir
        )

        stdout = result.stdout
        stderr = result.stderr
        returncode = result.returncode

        if returncode == 0:
            obj_value = extract_objective_value(stdout)
            if obj_value is not None:
                return {
                    'status': 'success',
                    'objective_value': obj_value,
                    'stdout': stdout,
                    'stderr': stderr
                }
            else:
                return {
                    'status': 'success_no_objective',
                    'objective_value': None,
                    'stdout': stdout,
                    'stderr': stderr
                }
        else:
            return {
                'status': 'execution_error',
                'objective_value': None,
                'stdout': stdout,
                'stderr': stderr,
                'returncode': returncode
            }

    except subprocess.TimeoutExpired:
        return {
            'status': 'timeout',
            'objective_value': None,
            'stdout': '',
            'stderr': f'Execution timeout after {timeout} seconds'
        }
    except Exception as e:
        return {
            'status': 'error',
            'objective_value': None,
            'stdout': '',
            'stderr': str(e)
        }


def check_correctness(pred_obj: Optional[float], gt_obj: Optional[float], tolerance: float = 0.05,
                      use_relative: bool = True) -> bool:
    """
    Check if predicted objective matches ground truth

    Args:
        pred_obj: Predicted objective value
        gt_obj: Ground truth objective value
        tolerance: Tolerance for comparison
        use_relative: Use relative tolerance if True, absolute if False

    Returns:
        True if values match within tolerance
    """
    if pred_obj is None or gt_obj is None:
        return False

    try:
        pred_obj = float(pred_obj)
        gt_obj = float(gt_obj)

        if gt_obj == 0:
            return abs(pred_obj) <= tolerance

        if use_relative:
            return abs((pred_obj - gt_obj) / gt_obj) <= tolerance
        else:
            return abs(pred_obj - gt_obj) <= tolerance
    except (ValueError, TypeError):
        return False


def evaluate_results(results: List[Dict], args) -> Dict:
    """
    Evaluate execution results

    Args:
        results: List of result dictionaries
        args: Command line arguments

    Returns:
        Evaluation report dictionary
    """
    total = len(results)
    correct = 0

    status_counts = defaultdict(int)
    correct_ids = []
    incorrect_details = []

    for result in results:
        status = result['execution_status']
        status_counts[status] += 1

        if status == 'success' and result['is_correct']:
            correct += 1
            correct_ids.append(result['id'])
        elif status == 'success' and not result['is_correct']:
            incorrect_details.append({
                'id': result['id'],
                'predicted': result['predicted_objective'],
                'ground_truth': result['ground_truth']
            })

    accuracy = correct / total if total > 0 else 0.0

    report = {
        'total_problems': total,
        'correct': correct,
        'accuracy': accuracy,
        'status_counts': dict(status_counts),
        'correct_ids': correct_ids,
        'incorrect_details': incorrect_details[:10],  # Save first 10 for reference
        'settings': {
            'tolerance': args.tolerance,
            'use_relative_tolerance': args.use_relative_tolerance,
            'timeout': args.timeout
        }
    }

    return report


def process_single_problem(gen_result, args):
    """Process a single problem (for parallel execution)"""
    problem_id = gen_result['id']
    code = gen_result['generated_code']
    gt_answer = gen_result.get('answer')

    if not code:
        result = {
            'id': problem_id,
            'execution_status': 'no_code',
            'predicted_objective': None,
            'ground_truth': gt_answer,
            'is_correct': False
        }
    else:
        exec_result = execute_code(code, problem_id, args.output_dir, args.timeout)

        pred_obj = exec_result['objective_value']
        is_correct = False

        if pred_obj is not None and gt_answer is not None:
            try:
                gt_obj = float(gt_answer)
                is_correct = check_correctness(
                    pred_obj, gt_obj,
                    args.tolerance,
                    args.use_relative_tolerance
                )
            except (ValueError, TypeError):
                is_correct = False

        result = {
            'id': problem_id,
            'execution_status': exec_result['status'],
            'predicted_objective': pred_obj,
            'ground_truth': gt_answer,
            'is_correct': is_correct,
            'stdout': exec_result['stdout'][:500] if args.save_output else '',
            'stderr': exec_result['stderr'][:500] if args.save_output else ''
        }

    return result


def main(args):
    # Load generated results
    if not os.path.exists(args.input_file):
        raise FileNotFoundError(f"Input file not found: {args.input_file}")

    with open(args.input_file, 'r', encoding='utf-8') as f:
        generated_results = [json.loads(line) for line in f if line.strip()]

    print(f"Loaded {len(generated_results)} generated results")

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)
    id_to_problem = {record['id']: record for record in generated_results}

    debug_store = None
    memory_helper = None
    memory_bank = None
    if not args.disable_debug_memory:
        try:
            from .debug_memory import DebugMemoryStore
            from .memory_bank import MemoryBank
            from .memory_intelligence import MemoryIntelligence
        except ModuleNotFoundError as exc:
            print(
                f"⚠️ Debug-memory dependencies missing ({exc}). "
                "Continuing with --disable_debug_memory behavior."
            )
            args.disable_debug_memory = True
        else:
            debug_store = DebugMemoryStore(args.debug_memory_path)
            if args.category_guidelines_path:
                try:
                    memory_helper = MemoryIntelligence(args.category_guidelines_path)
                except Exception as exc:  # noqa: BLE001
                    print(f"Warning: failed to load category guidelines ({exc})")
            if args.memory_dir:
                try:
                    if args.embedding_model:
                        memory_bank = MemoryBank(args.memory_dir, embedding_model=args.embedding_model)
                    else:
                        memory_bank = MemoryBank(args.memory_dir)
                except Exception as exc:  # noqa: BLE001
                    print(f"Warning: failed to load memory bank from {args.memory_dir} ({exc})")

    # Execute and evaluate each result
    evaluation_results = []

    if args.num_workers > 1:
        # Parallel execution
        print(f"Using {args.num_workers} workers for parallel execution")
        with ProcessPoolExecutor(max_workers=args.num_workers) as executor:
            # Submit all tasks
            future_to_problem = {
                executor.submit(process_single_problem, gen_result, args): gen_result
                for gen_result in generated_results
            }

            # Collect results with progress bar
            with tqdm(total=len(generated_results), desc="Executing") as pbar:
                for future in as_completed(future_to_problem):
                    try:
                        result = future.result()
                        evaluation_results.append(result)
                        status_symbol = '✓' if result['is_correct'] else '✗'
                        pbar.set_postfix_str(f"Problem {result['id']}: {status_symbol}")
                        pbar.update(1)
                    except Exception as e:
                        gen_result = future_to_problem[future]
                        print(f"\nError processing problem {gen_result['id']}: {e}")
                        evaluation_results.append({
                            'id': gen_result['id'],
                            'execution_status': 'error',
                            'predicted_objective': None,
                            'ground_truth': gen_result.get('answer'),
                            'is_correct': False,
                            'stdout': '',
                            'stderr': str(e)
                        })
                        pbar.update(1)

        # Sort results by ID to maintain order
        evaluation_results.sort(key=lambda x: x['id'])
    else:
        # Sequential execution (original behavior)
        for gen_result in generated_results:
            problem_id = gen_result['id']
            print(f"Processing problem {problem_id}...", end=' ')

            result = process_single_problem(gen_result, args)
            evaluation_results.append(result)

            status_symbol = '✓' if result['is_correct'] else '✗'
            print(f"{status_symbol} [{result['execution_status']}]")

    # Provide memory-aided suggestions for failures
    if not args.disable_debug_memory:
        for result in evaluation_results:
            status = result['execution_status']
            if status in ('execution_error', 'success_no_objective', 'timeout', 'no_code'):
                gen_result = id_to_problem.get(result['id'], {})
                description = gen_result.get('description', '')
                error_message = result.get('stderr') or result.get('stdout') or ''
                if not error_message:
                    if status == 'timeout':
                        error_message = 'Execution timeout'
                    elif status == 'no_code':
                        error_message = 'No code was generated for execution.'
                    elif status == 'success_no_objective':
                        error_message = 'Execution succeeded but no objective value was captured.'
                write_debug_suggestions(
                    problem_id=result['id'],
                    description=description,
                    error_message=error_message,
                    memory_helper=memory_helper,
                    memory_bank=memory_bank,
                    output_dir=args.output_dir,
                    status=status,
                    debug_store=debug_store,
                )

    # Generate evaluation report
    report = evaluate_results(evaluation_results, args)

    # Save detailed results
    results_file = os.path.join(args.output_dir, 'evaluation_results.jsonl')
    with open(results_file, 'w', encoding='utf-8') as f:
        for result in evaluation_results:
            f.write(json.dumps(result, ensure_ascii=False) + '\n')

    # Save evaluation report
    report_file = os.path.join(args.output_dir, 'evaluation_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    # Print summary
    print(f"\n{'='*60}")
    print("EVALUATION SUMMARY")
    print(f"{'='*60}")
    print(f"Total problems: {report['total_problems']}")
    print(f"Correct: {report['correct']}")
    print(f"Accuracy: {report['accuracy']:.2%}")
    print("\nStatus breakdown:")
    for status, count in sorted(report['status_counts'].items()):
        print(f"  {status:20s}: {count:3d} ({count/report['total_problems']:.1%})")
    print(f"{'='*60}")
    print("\nResults saved to:")
    print(f"  {results_file}")
    print(f"  {report_file}")


def parse_args():
    parser = argparse.ArgumentParser(description="Execute and evaluate generated Gurobi code")

    parser.add_argument("--input_file", type=str, required=True,
                        help="Path to generated results JSONL file")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="Directory to save execution results")
    parser.add_argument("--timeout", type=int, default=60,
                        help="Timeout for code execution (seconds)")
    parser.add_argument("--tolerance", type=float, default=0.05,
                        help="Tolerance for answer comparison")
    parser.add_argument("--use_relative_tolerance", action="store_true",
                        help="Use relative tolerance (default: absolute)")
    parser.add_argument("--save_output", action="store_true",
                        help="Save stdout/stderr in results")
    parser.add_argument("--num_workers", type=int, default=100,
                        help="Number of parallel workers for execution")
    parser.add_argument("--memory_dir", type=str, default=str(DEFAULT_MEMORY_DIR),
                        help="Path to episodic memory directory (used for debug suggestions)")
    parser.add_argument("--embedding_model", type=str, default=None,
                        help="Optional embedding model name or local path for debug-memory retrieval")
    parser.add_argument("--category_guidelines_path", type=str,
                        default=str(DEFAULT_GUIDELINES),
                        help="Path to category guideline JSONL file")
    parser.add_argument("--debug_memory_path", type=str,
                        default=str(DEFAULT_DEBUG_MEMORY),
                        help="Path to persistent debug memory JSONL file")
    parser.add_argument("--disable_debug_memory", action="store_true",
                        help="Disable memory-based debug suggestions")

    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    main(args)
```
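`check_correctness` treats a zero ground truth as a special case and otherwise compares the relative (or absolute) gap against the tolerance. A minimal sketch of those semantics, assuming the package imports cleanly (its module-level imports include `tqdm`):

```python
from src.debate_memory.execute import check_correctness

# Relative tolerance: |pred - gt| / |gt| <= 0.05
print(check_correctness(104.0, 100.0, tolerance=0.05, use_relative=True))  # True  (4% off)
print(check_correctness(106.0, 100.0, tolerance=0.05, use_relative=True))  # False (6% off)

# gt == 0 falls back to an absolute check regardless of use_relative.
print(check_correctness(0.03, 0.0, tolerance=0.05))                        # True
```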
src/debate_memory/generate_with_memory.py
ADDED
@@ -0,0 +1,920 @@
```python
"""
Generate with Memory: Single solution generation enhanced by memory retrieval
Based on simple_rag/generate.py + memory enhancement
"""

import argparse
import json
import os
import re
from pathlib import Path
from collections import Counter
from typing import Dict, List, Optional
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Import local utilities
from .llm import get_response
from .config import find_benchmark_path, get_prompt_template, normalize_dataset_name

# Import memory bank
from .memory_bank import MemoryBank
from .debug_memory import DebugMemoryStore
from .debug_executor import execute_generated_code, ExecutionResult

PROJECT_ROOT = Path(__file__).resolve().parents[2]
DEFAULT_MEMORY_DIR = PROJECT_ROOT / "memory_storage"
DEFAULT_DEBUG_MEMORY = DEFAULT_MEMORY_DIR / "debug_memory.jsonl"
DEFAULT_DEBUG_CASE_MEMORY = PROJECT_ROOT / "debug_case_memory"


class NoOpMemoryBank:
    """Memory-bank stub used when retrieval is explicitly disabled."""

    case_count = 0

    def retrieve_similar_cases(self, query: str, top_k: int = 0):
        return []

    def format_retrieved_cases_for_prompt(self, similar_cases):
        return ""


def load_dataset(dataset_name: str) -> List[Dict]:
    """
    Load dataset from the migrated benchmark directory layout.

    Args:
        dataset_name: Name of the dataset (e.g., "ComplexLP", "IndustryOR")

    Returns:
        List of problem dictionaries with 'description' and 'answer' fields
    """
    dataset_name = normalize_dataset_name(dataset_name)
    dataset_path = find_benchmark_path(PROJECT_ROOT, dataset_name)

    problems = []
    with dataset_path.open('r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if line.strip():
                data = json.loads(line)
                # Map en_question to description if it exists
                if 'en_question' in data and 'description' not in data:
                    data['description'] = data['en_question']
                # Map en_answer to answer if it exists
                if 'en_answer' in data and 'answer' not in data:
                    data['answer'] = data['en_answer']
                # Set id if not already present
                if 'id' not in data:
                    data['id'] = idx
                problems.append(data)

    print(f"Loaded {len(problems)} problems from {dataset_name}")
    return problems


def extract_python_code(text: str) -> str:
    """
    Extract Python code from LLM output
    Looks for code within <python>...</python> tags or ```python...``` blocks

    Args:
        text: LLM output text

    Returns:
        Extracted Python code
    """
    # Try to extract from <python>...</python> tags first
    pattern_xml = r'<python>(.*?)</python>'
    match = re.search(pattern_xml, text, re.DOTALL | re.IGNORECASE)
    if match:
        code = match.group(1).strip()
        # Remove markdown code fences if present
        code = re.sub(r'^```python\s*\n', '', code)
        code = re.sub(r'\n```\s*$', '', code)
        return code

    # Try to extract from ```python...``` blocks
    pattern_markdown = r'```python(.*?)```'
    match = re.search(pattern_markdown, text, re.DOTALL)
    if match:
        return match.group(1).strip()

    # If no code blocks found, return empty string
    return ""


def _truncate_text(text: str, limit: int = 1200) -> str:
    if isinstance(text, bytes):
        text = text.decode("utf-8", errors="replace")
    snippet = (text or "").strip()
    if not snippet:
        return ""
    if len(snippet) <= limit:
        return snippet
    return snippet[:limit] + "\n... (truncated)"


def write_debug_report(
    problem_id: int,
    description: str,
    exec_result: ExecutionResult,
    base_output_dir: str,
) -> str:
    debug_dir = os.path.join(base_output_dir, "debug")
    os.makedirs(debug_dir, exist_ok=True)
    path = os.path.join(debug_dir, f"problem_{problem_id}_debug.md")

    stdout_snippet = _truncate_text(exec_result.stdout)
    stderr_snippet = _truncate_text(exec_result.stderr)

    lines = [
        f"# Debug Report for Problem {problem_id}",
        "",
        f"- **Status:** {exec_result.status}",
    ]
    if exec_result.code_path:
        rel_path = os.path.relpath(exec_result.code_path, base_output_dir)
        lines.append(f"- **Code path:** {rel_path}")
    if description:
        lines.extend(["", "## Description", description.strip()])
    if stdout_snippet:
        lines.extend(["", "## Stdout", "```", stdout_snippet, "```"])
    if stderr_snippet:
        lines.extend(["", "## Stderr", "```", stderr_snippet, "```"])
    if not stdout_snippet and not stderr_snippet:
        lines.extend(["", "## Logs", "_No logs captured._"])

    with open(path, "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")

    return path


def filter_perfect_matches(similar_cases: List[Dict], current_description: str, max_filter: int = 1) -> List[Dict]:
    """
    Filter out cases with identical description (test set leakage)
    At most max_filter cases will be removed (default: 1)

    Args:
        similar_cases: List of retrieved cases with scores
        current_description: The description of current problem to compare against
        max_filter: Maximum number of perfect matches to filter out (default: 1)

    Returns:
        Filtered list of cases
    """
    filtered = []
    filtered_count = 0

    for case in similar_cases:
        case_desc = case['case'].get('description', '')
        problem_id = case['case'].get('problem_id', '?')
        score = case.get('score', 0.0)

        # Compare descriptions directly (exact match)
        # At most filter max_filter identical cases
        if case_desc.strip() == current_description.strip() and filtered_count < max_filter:
            filtered_count += 1
            print(f" ⚠️ Filtered: Case ID={problem_id}, similarity={score:.4f} (identical description, test set leakage)")
        else:
            filtered.append(case)

    if filtered_count > 0:
        print(f" 📊 Filtered {filtered_count} perfect match(es) (max: {max_filter}), {len(filtered)} cases remaining")

    return filtered


def refine_retrieved_cases_with_llm(
    similar_cases: List[Dict],
    current_problem_desc: str,
    model: str,
    temperature: float = 0.3
) -> str:
    """
    Use LLM to analyze ALL retrieved cases together and extract key insights

    This is a two-stage process:
    1. Retrieve similar cases (vector similarity)
    2. Use LLM to view ALL cases holistically and extract transferable insights

    Args:
        similar_cases: List of retrieved cases
        current_problem_desc: Current problem description
        model: Model name for analysis
        temperature: Temperature for analysis (slightly higher for creativity)

    Returns:
        Refined insights as a string
    """
    if not similar_cases:
        return ""

    # Build full cases content (no truncation - show everything to LLM)
    full_cases = ""
    for i, item in enumerate(similar_cases, 1):
        case = item['case']
        score = item['score']
        full_cases += f"\n{'='*70}\n"
        full_cases += f"Case {i} (Similarity Score: {score:.3f})\n"
        full_cases += f"{'='*70}\n\n"
        full_cases += f"**Problem Description:**\n{case['description']}\n\n"
        full_cases += f"**Complete Solution Code:**\n```python\n{case['solution_code']}\n```\n\n"
        full_cases += f"**Objective Value:** {case['objective_value']}\n"
        full_cases += f"**Status:** Correct ✓\n"
        full_cases += "\n"

    analysis_prompt = f"""You are an expert in optimization modeling. You will analyze multiple similar solved problems to extract **transferable insights** for a new problem.

## Current Problem to Solve:
{current_problem_desc}

## Retrieved Similar Cases (Complete):
{full_cases}

## Your Task:

Analyze ALL the cases above **holistically** and provide a structured analysis that will guide solving the current problem.

**Focus on:**

1. **Problem Type & Structure**: What category do these problems fall into? (e.g., production planning, resource allocation, scheduling, network flow)

2. **Common Modeling Patterns**:
   - What decision variables are typically used?
   - What types of constraints appear repeatedly?
   - How are objectives typically formulated?

3. **Key Techniques & Tricks**:
   - Any specific Gurobi features? (e.g., `addConstrs`, `quicksum`, binary variables, `setParam`)
   - Modeling tricks? (e.g., big-M, indicator constraints, piecewise linear)
   - Data structure patterns? (e.g., dictionaries for indices, list comprehensions)

4. **Adaptation Guidance**:
   - What aspects of the current problem are similar to the retrieved cases?
   - What's different and requires new thinking?
   - Which parts of the solution approaches can be directly adapted?

**Output Format**:
Provide a concise, actionable analysis (300-500 words) structured by the 4 points above. Be specific with code patterns and techniques, not just high-level descriptions.

**Important**: Extract **transferable knowledge**, not just summarize. Think about what the solver needs to know to adapt these solutions to the current problem."""

    try:
        analysis = get_response(analysis_prompt, model=model, temperature=temperature)
        return analysis
    except Exception as e:
        print(f" ⚠️ Warning: Failed to refine cases with LLM: {e}")
        # Fallback: return empty string, will use original formatting
        return ""


def format_debug_cases_for_prompt(cases: List[Dict]) -> str:
    if not cases:
        return ""
    lines = ["# Retrieved Debug Guidance", ""]
    for idx, item in enumerate(cases, 1):
        case = item["case"]
        score = item.get("score")
        signature = case.get("metadata", {}).get("signature", "unknown")
```
|
| 281 |
+
status = case.get("metadata", {}).get("status", "")
|
| 282 |
+
lines.append(f"## Case {idx} (similarity {score:.3f})")
|
| 283 |
+
lines.append(f"Signature: {signature} | Status: {status}")
|
| 284 |
+
description = case.get("description", "").strip()
|
| 285 |
+
if description:
|
| 286 |
+
lines.append(description if len(description) < 800 else description[:800] + "\n...")
|
| 287 |
+
lines.append("---")
|
| 288 |
+
return "\n".join(lines).strip()
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def build_error_feedback_prompt(
|
| 292 |
+
exec_result: ExecutionResult,
|
| 293 |
+
attempt_number: int,
|
| 294 |
+
previous_code: str,
|
| 295 |
+
debug_guidance: str = ""
|
| 296 |
+
) -> str:
|
| 297 |
+
"""
|
| 298 |
+
Build a prompt with error feedback for code correction
|
| 299 |
+
|
| 300 |
+
Args:
|
| 301 |
+
exec_result: Execution result with error information
|
| 302 |
+
attempt_number: Current attempt number
|
| 303 |
+
previous_code: The code that failed
|
| 304 |
+
|
| 305 |
+
Returns:
|
| 306 |
+
Feedback prompt string
|
| 307 |
+
"""
|
| 308 |
+
error_info = exec_result.stderr if exec_result.stderr else exec_result.stdout
|
| 309 |
+
if not error_info:
|
| 310 |
+
error_info = f"Status: {exec_result.status}"
|
| 311 |
+
|
| 312 |
+
feedback = f"""
|
| 313 |
+
# Code Execution Failed - Attempt {attempt_number}
|
| 314 |
+
|
| 315 |
+
Your previous code failed to execute successfully. Here is the error information:
|
| 316 |
+
|
| 317 |
+
## Error Details:
|
| 318 |
+
```
|
| 319 |
+
{error_info}
|
| 320 |
+
```
|
| 321 |
+
|
| 322 |
+
## Your Previous Code:
|
| 323 |
+
```python
|
| 324 |
+
{previous_code}
|
| 325 |
+
```
|
| 326 |
+
|
| 327 |
+
## Instructions:
|
| 328 |
+
1. Carefully analyze the error message above
|
| 329 |
+
2. Identify the root cause of the error
|
| 330 |
+
3. Fix the code to resolve the issue
|
| 331 |
+
4. Common issues to check:
|
| 332 |
+
- Variable indexing (e.g., accessing index 0 when valid indices start from 1)
|
| 333 |
+
- Missing variable definitions
|
| 334 |
+
- Incorrect constraint formulations
|
| 335 |
+
- Type mismatches
|
| 336 |
+
|
| 337 |
+
Please provide the CORRECTED code in a ```python``` code block. Make sure to:
|
| 338 |
+
- Fix the specific error mentioned above
|
| 339 |
+
- Keep the overall structure and logic intact
|
| 340 |
+
- Ensure all variables are properly defined before use
|
| 341 |
+
"""
|
| 342 |
+
if debug_guidance:
|
| 343 |
+
feedback += f"\n\n# Historical Debug Guidance\n{debug_guidance}\n"
|
| 344 |
+
return feedback
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
def generate_with_memory(
|
| 348 |
+
problem_id: int,
|
| 349 |
+
problem_desc: str,
|
| 350 |
+
memory_bank: MemoryBank,
|
| 351 |
+
model: str,
|
| 352 |
+
temperature: float,
|
| 353 |
+
top_k: int = 4,
|
| 354 |
+
filter_perfect: bool = True,
|
| 355 |
+
use_llm_refinement: bool = True,
|
| 356 |
+
*,
|
| 357 |
+
auto_debug: bool = True,
|
| 358 |
+
execution_timeout: int = 120,
|
| 359 |
+
debug_output_dir: Optional[str] = None,
|
| 360 |
+
debug_store: Optional[DebugMemoryStore] = None,
|
| 361 |
+
max_retries: int = 3,
|
| 362 |
+
debug_case_bank: Optional[MemoryBank] = None,
|
| 363 |
+
debug_case_top_k: int = 3
|
| 364 |
+
) -> Dict:
|
| 365 |
+
"""
|
| 366 |
+
Generate solution with memory enhancement
|
| 367 |
+
|
| 368 |
+
Args:
|
| 369 |
+
problem_id: Problem ID
|
| 370 |
+
problem_desc: Problem description
|
| 371 |
+
memory_bank: Memory bank instance
|
| 372 |
+
model: Model name
|
| 373 |
+
temperature: Generation temperature
|
| 374 |
+
top_k: Number of cases to retrieve (default: 4, will filter identical descriptions)
|
| 375 |
+
filter_perfect: Whether to filter out identical description matches
|
| 376 |
+
use_llm_refinement: Whether to use LLM to refine/summarize retrieved cases
|
| 377 |
+
auto_debug: Execute generated code and capture debug information
|
| 378 |
+
execution_timeout: Timeout (seconds) for executing generated code
|
| 379 |
+
debug_output_dir: Directory for storing debug artifacts (code, suggestions)
|
| 380 |
+
debug_store: Persistent store for debug experiences
|
| 381 |
+
|
| 382 |
+
Returns:
|
| 383 |
+
Dict with generation results
|
| 384 |
+
"""
|
| 385 |
+
# Retrieve similar cases from memory
|
| 386 |
+
similar_cases = memory_bank.retrieve_similar_cases(problem_desc, top_k=top_k)
|
| 387 |
+
original_retrieved = len(similar_cases)
|
| 388 |
+
|
| 389 |
+
# Filter out identical descriptions (test set leakage)
|
| 390 |
+
if filter_perfect and similar_cases:
|
| 391 |
+
similar_cases = filter_perfect_matches(similar_cases, problem_desc)
|
| 392 |
+
|
| 393 |
+
# Prepare memory context
|
| 394 |
+
memory_context = ""
|
| 395 |
+
refined_insights = ""
|
| 396 |
+
|
| 397 |
+
if similar_cases:
|
| 398 |
+
if use_llm_refinement:
|
| 399 |
+
# Use LLM to analyze and refine the retrieved cases
|
| 400 |
+
print(f" 🧠 Using LLM to refine {len(similar_cases)} retrieved cases...")
|
| 401 |
+
refined_insights = refine_retrieved_cases_with_llm(
|
| 402 |
+
similar_cases, problem_desc, model, temperature=0.3
|
| 403 |
+
)
|
| 404 |
+
|
| 405 |
+
if refined_insights:
|
| 406 |
+
memory_context = f"""# Insights from Similar Problems in Memory
|
| 407 |
+
|
| 408 |
+
Based on analysis of {len(similar_cases)} similar problems, here are key insights:
|
| 409 |
+
|
| 410 |
+
{refined_insights}
|
| 411 |
+
|
| 412 |
+
---
|
| 413 |
+
|
| 414 |
+
Please use these insights to guide your modeling approach for the current problem.
|
| 415 |
+
"""
|
| 416 |
+
else:
|
| 417 |
+
# Fallback to original formatting if refinement fails
|
| 418 |
+
memory_context = memory_bank.format_retrieved_cases_for_prompt(similar_cases)
|
| 419 |
+
else:
|
| 420 |
+
# Use original formatting (full cases)
|
| 421 |
+
memory_context = memory_bank.format_retrieved_cases_for_prompt(similar_cases)
|
| 422 |
+
|
| 423 |
+
# Build prompt with memory context
|
| 424 |
+
prompt_template = get_prompt_template("default")
|
| 425 |
+
system_prompt = prompt_template["system"]
|
| 426 |
+
user_prompt = prompt_template["user"].format(question=problem_desc)
|
| 427 |
+
|
| 428 |
+
# Inject memory context if available
|
| 429 |
+
if memory_context:
|
| 430 |
+
user_prompt = f"{memory_context}\n\n{user_prompt}"
|
| 431 |
+
|
| 432 |
+
# Generate solution with self-healing retry mechanism
|
| 433 |
+
full_prompt = f"{system_prompt}\n\n{user_prompt}"
|
| 434 |
+
|
| 435 |
+
# Calculate prompt length for monitoring
|
| 436 |
+
prompt_length = len(full_prompt)
|
| 437 |
+
prompt_tokens_estimate = prompt_length // 4 # Rough estimate: 1 token ≈ 4 chars
|
| 438 |
+
|
| 439 |
+
# Variables to track across attempts
|
| 440 |
+
attempt_history = []
|
| 441 |
+
final_response = ''
|
| 442 |
+
final_code = ''
|
| 443 |
+
execution_status = 'not_executed'
|
| 444 |
+
execution_stdout = ''
|
| 445 |
+
execution_stderr = ''
|
| 446 |
+
execution_objective = None
|
| 447 |
+
execution_returncode = None
|
| 448 |
+
suggestions_path = ''
|
| 449 |
+
executed_code_path = ''
|
| 450 |
+
debug_signature = ''
|
| 451 |
+
|
| 452 |
+
try:
|
| 453 |
+
# Self-healing loop: try up to max_retries times
|
| 454 |
+
current_prompt = full_prompt
|
| 455 |
+
|
| 456 |
+
for attempt in range(1, max_retries + 1):
|
| 457 |
+
print(f" 🔄 Attempt {attempt}/{max_retries} for problem {problem_id}")
|
| 458 |
+
|
| 459 |
+
# Generate code
|
| 460 |
+
response = get_response(current_prompt, model=model, temperature=temperature)
|
| 461 |
+
code = extract_python_code(response)
|
| 462 |
+
|
| 463 |
+
# Record this attempt
|
| 464 |
+
attempt_info = {
|
| 465 |
+
'attempt_number': attempt,
|
| 466 |
+
'response': response,
|
| 467 |
+
'code': code,
|
| 468 |
+
'execution_status': 'not_executed',
|
| 469 |
+
}
|
| 470 |
+
|
| 471 |
+
if auto_debug and code.strip():
|
| 472 |
+
target_dir = debug_output_dir or os.path.join(os.getcwd(), "auto_debug")
|
| 473 |
+
os.makedirs(target_dir, exist_ok=True)
|
| 474 |
+
|
| 475 |
+
# Execute the generated code
|
| 476 |
+
exec_result = execute_generated_code(
|
| 477 |
+
code,
|
| 478 |
+
problem_id,
|
| 479 |
+
target_dir,
|
| 480 |
+
timeout=execution_timeout,
|
| 481 |
+
)
|
| 482 |
+
|
| 483 |
+
# Update attempt info
|
| 484 |
+
attempt_info['execution_status'] = exec_result.status
|
| 485 |
+
attempt_info['objective_value'] = exec_result.objective_value
|
| 486 |
+
attempt_info['stdout'] = exec_result.stdout[:200] if exec_result.stdout else ''
|
| 487 |
+
attempt_info['stderr'] = exec_result.stderr[:200] if exec_result.stderr else ''
|
| 488 |
+
|
| 489 |
+
# Check if execution was successful
|
| 490 |
+
if exec_result.status == 'success':
|
| 491 |
+
# Success! Use this result
|
| 492 |
+
print(f" ✅ Success on attempt {attempt}")
|
| 493 |
+
execution_status = exec_result.status
|
| 494 |
+
execution_stdout = exec_result.stdout
|
| 495 |
+
execution_stderr = exec_result.stderr
|
| 496 |
+
execution_objective = exec_result.objective_value
|
| 497 |
+
execution_returncode = exec_result.returncode
|
| 498 |
+
executed_code_path = exec_result.code_path or ''
|
| 499 |
+
final_response = response
|
| 500 |
+
final_code = code
|
| 501 |
+
attempt_history.append(attempt_info)
|
| 502 |
+
break # Exit the retry loop
|
| 503 |
+
else:
|
| 504 |
+
# Failure - prepare for retry
|
| 505 |
+
print(f" ❌ Failed on attempt {attempt}: {exec_result.status}")
|
| 506 |
+
execution_status = exec_result.status
|
| 507 |
+
execution_stdout = exec_result.stdout
|
| 508 |
+
execution_stderr = exec_result.stderr
|
| 509 |
+
execution_returncode = exec_result.returncode
|
| 510 |
+
executed_code_path = exec_result.code_path or ''
|
| 511 |
+
final_response = response
|
| 512 |
+
final_code = code
|
| 513 |
+
|
| 514 |
+
# Write debug report
|
| 515 |
+
suggestions_path = write_debug_report(
|
| 516 |
+
problem_id,
|
| 517 |
+
problem_desc,
|
| 518 |
+
exec_result,
|
| 519 |
+
target_dir,
|
| 520 |
+
)
|
| 521 |
+
|
| 522 |
+
# Record to debug store
|
| 523 |
+
error_message = execution_stderr or execution_stdout or execution_status
|
| 524 |
+
if debug_store:
|
| 525 |
+
debug_signature = debug_store.record_execution_feedback(
|
| 526 |
+
problem_id=problem_id,
|
| 527 |
+
description=problem_desc,
|
| 528 |
+
status=execution_status,
|
| 529 |
+
error_text=error_message,
|
| 530 |
+
guidance=f"Attempt {attempt}/{max_retries} failed. Review the debug report.",
|
| 531 |
+
source="generate_with_memory.auto_debug.self_healing",
|
| 532 |
+
metadata={
|
| 533 |
+
"attempt": attempt,
|
| 534 |
+
"returncode": execution_returncode,
|
| 535 |
+
"code_path": executed_code_path,
|
| 536 |
+
},
|
| 537 |
+
)
|
| 538 |
+
|
| 539 |
+
attempt_history.append(attempt_info)
|
| 540 |
+
|
| 541 |
+
# If not the last attempt, prepare retry prompt
|
| 542 |
+
if attempt < max_retries:
|
| 543 |
+
guidance_text = ""
|
| 544 |
+
if debug_case_bank and error_message:
|
| 545 |
+
debug_cases = debug_case_bank.retrieve_similar_cases(
|
| 546 |
+
error_message,
|
| 547 |
+
top_k=debug_case_top_k,
|
| 548 |
+
)
|
| 549 |
+
guidance_text = format_debug_cases_for_prompt(debug_cases)
|
| 550 |
+
error_feedback = build_error_feedback_prompt(
|
| 551 |
+
exec_result,
|
| 552 |
+
attempt,
|
| 553 |
+
code,
|
| 554 |
+
debug_guidance=guidance_text,
|
| 555 |
+
)
|
| 556 |
+
# Append error feedback to the prompt for next attempt
|
| 557 |
+
current_prompt = f"{full_prompt}\n\n{error_feedback}"
|
| 558 |
+
print(f" 🔧 Preparing retry with error feedback...")
|
| 559 |
+
else:
|
| 560 |
+
print(f" ⚠️ Max retries ({max_retries}) reached, giving up")
|
| 561 |
+
|
| 562 |
+
elif not code.strip():
|
| 563 |
+
# No code generated
|
| 564 |
+
attempt_info['execution_status'] = 'no_code'
|
| 565 |
+
attempt_history.append(attempt_info)
|
| 566 |
+
execution_status = 'no_code'
|
| 567 |
+
execution_stderr = 'Generated code block is empty.'
|
| 568 |
+
final_response = response
|
| 569 |
+
final_code = code
|
| 570 |
+
|
| 571 |
+
if attempt < max_retries:
|
| 572 |
+
# Retry with feedback about missing code
|
| 573 |
+
feedback = "\n\nYour previous response did not contain any Python code. Please provide the complete Gurobi code in a ```python``` code block."
|
| 574 |
+
current_prompt = f"{full_prompt}\n\n{feedback}"
|
| 575 |
+
print(f" ⚠️ No code generated, retrying...")
|
| 576 |
+
else:
|
| 577 |
+
print(f" ⚠️ Max retries reached, no code generated")
|
| 578 |
+
break
|
| 579 |
+
|
| 580 |
+
elif not auto_debug:
|
| 581 |
+
# Auto debug disabled, just use the generated code
|
| 582 |
+
execution_status = 'skipped'
|
| 583 |
+
final_response = response
|
| 584 |
+
final_code = code
|
| 585 |
+
attempt_history.append(attempt_info)
|
| 586 |
+
break
|
| 587 |
+
|
| 588 |
+
if auto_debug:
|
| 589 |
+
if execution_status == 'success':
|
| 590 |
+
final_status = 'success'
|
| 591 |
+
elif final_code.strip():
|
| 592 |
+
final_status = 'execution_failed'
|
| 593 |
+
else:
|
| 594 |
+
final_status = 'no_code'
|
| 595 |
+
else:
|
| 596 |
+
final_status = 'success' if final_code.strip() else 'no_code'
|
| 597 |
+
|
| 598 |
+
return {
|
| 599 |
+
'id': problem_id,
|
| 600 |
+
'model': model,
|
| 601 |
+
'temperature': temperature,
|
| 602 |
+
'description': problem_desc,
|
| 603 |
+
'full_input_prompt': full_prompt, # 💾 Complete input for reproducibility
|
| 604 |
+
'refined_insights': refined_insights if use_llm_refinement else '', # LLM-refined insights
|
| 605 |
+
'prompt_length_chars': prompt_length,
|
| 606 |
+
'prompt_length_tokens_est': prompt_tokens_estimate,
|
| 607 |
+
'raw_response': final_response,
|
| 608 |
+
'generated_code': final_code,
|
| 609 |
+
'retrieved_cases': len(similar_cases),
|
| 610 |
+
'original_retrieved': original_retrieved,
|
| 611 |
+
'use_llm_refinement': use_llm_refinement,
|
| 612 |
+
'status': final_status,
|
| 613 |
+
'execution_status': execution_status,
|
| 614 |
+
'execution_stdout': execution_stdout,
|
| 615 |
+
'execution_stderr': execution_stderr,
|
| 616 |
+
'execution_objective_value': execution_objective,
|
| 617 |
+
'execution_returncode': execution_returncode,
|
| 618 |
+
'debug_suggestions_path': suggestions_path,
|
| 619 |
+
'executed_code_path': executed_code_path if executed_code_path else '',
|
| 620 |
+
'debug_signature': debug_signature,
|
| 621 |
+
'auto_debug_enabled': auto_debug,
|
| 622 |
+
'execution_timeout_sec': execution_timeout if auto_debug else None,
|
| 623 |
+
'max_retries': max_retries,
|
| 624 |
+
'total_attempts': len(attempt_history),
|
| 625 |
+
'attempt_history': attempt_history,
|
| 626 |
+
'self_healing_enabled': True,
|
| 627 |
+
}
|
| 628 |
+
|
| 629 |
+
except Exception as e:
|
| 630 |
+
print(f"Error generating solution for problem {problem_id}: {e}")
|
| 631 |
+
|
| 632 |
+
# Still save the prompt even on error
|
| 633 |
+
full_prompt = f"{system_prompt}\n\n{user_prompt}"
|
| 634 |
+
|
| 635 |
+
return {
|
| 636 |
+
'id': problem_id,
|
| 637 |
+
'model': model,
|
| 638 |
+
'temperature': temperature,
|
| 639 |
+
'description': problem_desc,
|
| 640 |
+
'full_input_prompt': full_prompt, # Save even on error
|
| 641 |
+
'refined_insights': '',
|
| 642 |
+
'prompt_length_chars': len(full_prompt),
|
| 643 |
+
'prompt_length_tokens_est': len(full_prompt) // 4,
|
| 644 |
+
'raw_response': '',
|
| 645 |
+
'generated_code': '',
|
| 646 |
+
'retrieved_cases': len(similar_cases) if similar_cases else 0,
|
| 647 |
+
'original_retrieved': original_retrieved,
|
| 648 |
+
'use_llm_refinement': use_llm_refinement,
|
| 649 |
+
'status': 'error',
|
| 650 |
+
'error': str(e),
|
| 651 |
+
'execution_status': 'not_executed',
|
| 652 |
+
'execution_stdout': '',
|
| 653 |
+
'execution_stderr': '',
|
| 654 |
+
'execution_objective_value': None,
|
| 655 |
+
'execution_returncode': None,
|
| 656 |
+
'debug_suggestions_path': '',
|
| 657 |
+
'executed_code_path': '',
|
| 658 |
+
'debug_signature': '',
|
| 659 |
+
'auto_debug_enabled': auto_debug,
|
| 660 |
+
'execution_timeout_sec': execution_timeout if auto_debug else None,
|
| 661 |
+
'max_retries': max_retries,
|
| 662 |
+
'total_attempts': 0,
|
| 663 |
+
'attempt_history': [],
|
| 664 |
+
'self_healing_enabled': True,
|
| 665 |
+
}
|
| 666 |
+
|
| 667 |
+
|
| 668 |
+
def generate_single_problem(
|
| 669 |
+
problem: Dict,
|
| 670 |
+
memory_bank: MemoryBank,
|
| 671 |
+
model: str,
|
| 672 |
+
temperature: float,
|
| 673 |
+
top_k: int,
|
| 674 |
+
filter_perfect: bool,
|
| 675 |
+
use_llm_refinement: bool,
|
| 676 |
+
*,
|
| 677 |
+
auto_debug: bool,
|
| 678 |
+
execution_timeout: int,
|
| 679 |
+
debug_output_dir: Optional[str],
|
| 680 |
+
debug_store: Optional[DebugMemoryStore],
|
| 681 |
+
max_retries: int = 3,
|
| 682 |
+
debug_case_bank: Optional[MemoryBank] = None,
|
| 683 |
+
debug_case_top_k: int = 3,
|
| 684 |
+
) -> Dict:
|
| 685 |
+
"""
|
| 686 |
+
Wrapper for parallel execution
|
| 687 |
+
"""
|
| 688 |
+
problem_id = problem['id']
|
| 689 |
+
problem_desc = problem['description']
|
| 690 |
+
|
| 691 |
+
result = generate_with_memory(
|
| 692 |
+
problem_id, problem_desc, memory_bank,
|
| 693 |
+
model, temperature, top_k, filter_perfect, use_llm_refinement,
|
| 694 |
+
auto_debug=auto_debug,
|
| 695 |
+
execution_timeout=execution_timeout,
|
| 696 |
+
debug_output_dir=debug_output_dir,
|
| 697 |
+
debug_store=debug_store,
|
| 698 |
+
max_retries=max_retries,
|
| 699 |
+
debug_case_bank=debug_case_bank,
|
| 700 |
+
debug_case_top_k=debug_case_top_k,
|
| 701 |
+
)
|
| 702 |
+
|
| 703 |
+
# Add ground truth
|
| 704 |
+
result['answer'] = problem.get('answer', '')
|
| 705 |
+
|
| 706 |
+
return result
|
| 707 |
+
|
| 708 |
+
|
| 709 |
+
def main():
|
| 710 |
+
parser = argparse.ArgumentParser(description='Generate with Memory (parallel single solutions)')
|
| 711 |
+
parser.add_argument('--dataset', type=str, default='IndustryOR',
|
| 712 |
+
help='Dataset name')
|
| 713 |
+
parser.add_argument('--model', type=str, default='gpt-4o',
|
| 714 |
+
help='Model name')
|
| 715 |
+
parser.add_argument('--temperature', type=float, default=0.01,
|
| 716 |
+
help='Temperature for generation')
|
| 717 |
+
parser.add_argument('--max_problems', type=int, default=None,
|
| 718 |
+
help='Maximum number of problems to process')
|
| 719 |
+
parser.add_argument('--output', type=str, required=True,
|
| 720 |
+
help='Output file path (JSONL)')
|
| 721 |
+
parser.add_argument('--memory_dir', type=str, default=str(DEFAULT_MEMORY_DIR),
|
| 722 |
+
help='Memory storage directory')
|
| 723 |
+
parser.add_argument('--embedding_model', type=str, default=None,
|
| 724 |
+
help='Optional embedding model name or local path for memory retrieval')
|
| 725 |
+
parser.add_argument('--memory_top_k', type=int, default=4,
|
| 726 |
+
help='Number of cases to retrieve from memory (default: 4)')
|
| 727 |
+
parser.add_argument('--no_filter_perfect', action='store_true',
|
| 728 |
+
help='Disable filtering of perfect similarity matches')
|
| 729 |
+
parser.add_argument('--use_llm_refinement', action='store_true',
|
| 730 |
+
help='Use LLM to refine/summarize retrieved cases (improves quality, costs more API calls)')
|
| 731 |
+
parser.add_argument('--parallel', type=int, default=5,
|
| 732 |
+
help='Number of parallel workers')
|
| 733 |
+
parser.add_argument('--execution_timeout', type=int, default=120,
|
| 734 |
+
help='Timeout (seconds) for executing generated code during auto-debug')
|
| 735 |
+
parser.add_argument('--no_auto_debug', action='store_true',
|
| 736 |
+
help='Disable automatic execution and debug capture for generated code')
|
| 737 |
+
parser.add_argument('--debug_output_dir', type=str, default=None,
|
| 738 |
+
help='Directory to store auto-debug artifacts (code, logs, suggestions)')
|
| 739 |
+
parser.add_argument('--debug_memory_path', type=str, default=str(DEFAULT_DEBUG_MEMORY),
|
| 740 |
+
help='Path to persistent debug memory JSONL file')
|
| 741 |
+
parser.add_argument('--debug_case_memory_dir', type=str, default=str(DEFAULT_DEBUG_CASE_MEMORY),
|
| 742 |
+
help='Directory containing consolidated debug-case memory (built via build_debug_memory.py)')
|
| 743 |
+
parser.add_argument('--debug_case_memory_top_k', type=int, default=3,
|
| 744 |
+
help='How many debug memory cases to retrieve when execution fails')
|
| 745 |
+
parser.add_argument('--max_retries', type=int, default=3,
|
| 746 |
+
help='Maximum number of retry attempts for self-healing (default: 3)')
|
| 747 |
+
|
| 748 |
+
args = parser.parse_args()
|
| 749 |
+
|
| 750 |
+
args.dataset = normalize_dataset_name(args.dataset)
|
| 751 |
+
|
| 752 |
+
auto_debug_enabled = not args.no_auto_debug
|
| 753 |
+
debug_output_dir = args.debug_output_dir
|
| 754 |
+
debug_store: Optional[DebugMemoryStore] = None
|
| 755 |
+
if auto_debug_enabled:
|
| 756 |
+
if debug_output_dir is None:
|
| 757 |
+
base_dir = os.path.dirname(args.output) or '.'
|
| 758 |
+
debug_output_dir = os.path.join(base_dir, 'auto_debug')
|
| 759 |
+
os.makedirs(debug_output_dir, exist_ok=True)
|
| 760 |
+
debug_store = DebugMemoryStore(args.debug_memory_path)
|
| 761 |
+
else:
|
| 762 |
+
debug_output_dir = None
|
| 763 |
+
|
| 764 |
+
debug_case_bank: Optional[MemoryBank] = None
|
| 765 |
+
if auto_debug_enabled and args.debug_case_memory_top_k > 0 and args.debug_case_memory_dir:
|
| 766 |
+
case_dir = Path(args.debug_case_memory_dir)
|
| 767 |
+
if case_dir.exists():
|
| 768 |
+
try:
|
| 769 |
+
if args.embedding_model:
|
| 770 |
+
debug_case_bank = MemoryBank(
|
| 771 |
+
memory_dir=str(case_dir),
|
| 772 |
+
embedding_model=args.embedding_model,
|
| 773 |
+
)
|
| 774 |
+
else:
|
| 775 |
+
debug_case_bank = MemoryBank(memory_dir=str(case_dir))
|
| 776 |
+
except Exception as exc: # noqa: BLE001
|
| 777 |
+
print(f"⚠️ Warning: failed to load debug-case memory from {case_dir} ({exc})")
|
| 778 |
+
else:
|
| 779 |
+
print(f"ℹ️ Debug-case memory directory not found: {case_dir} (skipping retrieval)")
|
| 780 |
+
|
| 781 |
+
print("="*80)
|
| 782 |
+
print("🧠 Generate with Memory (Parallel)")
|
| 783 |
+
print("="*80)
|
| 784 |
+
print(f"Dataset: {args.dataset}")
|
| 785 |
+
print(f"Model: {args.model}")
|
| 786 |
+
print(f"Temperature: {args.temperature}")
|
| 787 |
+
print(f"Memory dir: {args.memory_dir}")
|
| 788 |
+
if args.embedding_model:
|
| 789 |
+
print(f"Embedding: {args.embedding_model}")
|
| 790 |
+
print(f"Memory Top-K: {args.memory_top_k}")
|
| 791 |
+
print(f"Filter perfect matches: {not args.no_filter_perfect}")
|
| 792 |
+
print(f"LLM Refinement: {'✅ Enabled' if args.use_llm_refinement else '❌ Disabled'}")
|
| 793 |
+
print(f"Parallel: {args.parallel}")
|
| 794 |
+
print(f"Output: {args.output}")
|
| 795 |
+
print(f"Auto Debug: {'✅ Enabled' if auto_debug_enabled else '❌ Disabled'}")
|
| 796 |
+
if auto_debug_enabled:
|
| 797 |
+
print(f" Debug dir: {debug_output_dir}")
|
| 798 |
+
if args.debug_memory_path:
|
| 799 |
+
print(f" Debug memory: {args.debug_memory_path}")
|
| 800 |
+
print(f" Exec timeout: {args.execution_timeout}s")
|
| 801 |
+
print(f" Max retries: {args.max_retries} (Self-healing enabled)")
|
| 802 |
+
print("="*80)
|
| 803 |
+
print()
|
| 804 |
+
|
| 805 |
+
# Initialize memory bank only when retrieval is active.
|
| 806 |
+
if args.memory_top_k > 0:
|
| 807 |
+
print("Initializing memory bank...")
|
| 808 |
+
if args.embedding_model:
|
| 809 |
+
memory_bank = MemoryBank(memory_dir=args.memory_dir, embedding_model=args.embedding_model)
|
| 810 |
+
else:
|
| 811 |
+
memory_bank = MemoryBank(memory_dir=args.memory_dir)
|
| 812 |
+
print()
|
| 813 |
+
else:
|
| 814 |
+
print("Skipping memory bank initialization because memory_top_k=0")
|
| 815 |
+
print()
|
| 816 |
+
memory_bank = NoOpMemoryBank()
|
| 817 |
+
|
| 818 |
+
# Load dataset
|
| 819 |
+
problems = load_dataset(args.dataset)
|
| 820 |
+
if args.max_problems:
|
| 821 |
+
problems = problems[:args.max_problems]
|
| 822 |
+
|
| 823 |
+
print(f"Processing {len(problems)} problems with {args.parallel} workers")
|
| 824 |
+
print()
|
| 825 |
+
|
| 826 |
+
# Create output directory
|
| 827 |
+
os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True)
|
| 828 |
+
|
| 829 |
+
# Parallel generation
|
| 830 |
+
results = []
|
| 831 |
+
|
| 832 |
+
if args.parallel <= 1:
|
| 833 |
+
# Sequential processing
|
| 834 |
+
for problem in tqdm(problems, desc="Generating"):
|
| 835 |
+
result = generate_single_problem(
|
| 836 |
+
problem, memory_bank, args.model, args.temperature,
|
| 837 |
+
args.memory_top_k, not args.no_filter_perfect, args.use_llm_refinement,
|
| 838 |
+
auto_debug=auto_debug_enabled,
|
| 839 |
+
execution_timeout=args.execution_timeout,
|
| 840 |
+
debug_output_dir=debug_output_dir,
|
| 841 |
+
debug_store=debug_store,
|
| 842 |
+
max_retries=args.max_retries,
|
| 843 |
+
debug_case_bank=debug_case_bank,
|
| 844 |
+
debug_case_top_k=args.debug_case_memory_top_k,
|
| 845 |
+
)
|
| 846 |
+
results.append(result)
|
| 847 |
+
else:
|
| 848 |
+
# Parallel processing
|
| 849 |
+
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
|
| 850 |
+
futures = {
|
| 851 |
+
executor.submit(
|
| 852 |
+
generate_single_problem,
|
| 853 |
+
problem, memory_bank, args.model, args.temperature,
|
| 854 |
+
args.memory_top_k, not args.no_filter_perfect, args.use_llm_refinement,
|
| 855 |
+
auto_debug=auto_debug_enabled,
|
| 856 |
+
execution_timeout=args.execution_timeout,
|
| 857 |
+
debug_output_dir=debug_output_dir,
|
| 858 |
+
debug_store=debug_store,
|
| 859 |
+
max_retries=args.max_retries,
|
| 860 |
+
debug_case_bank=debug_case_bank,
|
| 861 |
+
debug_case_top_k=args.debug_case_memory_top_k,
|
| 862 |
+
): problem for problem in problems
|
| 863 |
+
}
|
| 864 |
+
|
| 865 |
+
for future in tqdm(as_completed(futures), total=len(problems), desc="Generating"):
|
| 866 |
+
try:
|
| 867 |
+
result = future.result()
|
| 868 |
+
results.append(result)
|
| 869 |
+
except Exception as e:
|
| 870 |
+
problem = futures[future]
|
| 871 |
+
print(f"Error processing problem {problem['id']}: {e}")
|
| 872 |
+
|
| 873 |
+
# Sort by problem ID
|
| 874 |
+
results.sort(key=lambda x: x['id'])
|
| 875 |
+
|
| 876 |
+
# Save results
|
| 877 |
+
with open(args.output, 'w', encoding='utf-8') as f:
|
| 878 |
+
for result in results:
|
| 879 |
+
f.write(json.dumps(result, ensure_ascii=False) + '\n')
|
| 880 |
+
|
| 881 |
+
print()
|
| 882 |
+
print("="*80)
|
| 883 |
+
print("✅ Generation Complete")
|
| 884 |
+
print("="*80)
|
| 885 |
+
print(f"Total problems: {len(results)}")
|
| 886 |
+
status_counts = Counter(r.get('status', 'unknown') for r in results)
|
| 887 |
+
print(f"Successful: {status_counts.get('success', 0)}")
|
| 888 |
+
print(f"Errors: {status_counts.get('error', 0)}")
|
| 889 |
+
print(f"Results saved to: {args.output}")
|
| 890 |
+
if status_counts:
|
| 891 |
+
print("Status breakdown:")
|
| 892 |
+
for status, count in sorted(status_counts.items()):
|
| 893 |
+
print(f" {status:<18}: {count}")
|
| 894 |
+
|
| 895 |
+
# Memory statistics
|
| 896 |
+
total_retrieved = sum(r.get('retrieved_cases', 0) for r in results)
|
| 897 |
+
total_original = sum(r.get('original_retrieved', 0) for r in results)
|
| 898 |
+
filtered = total_original - total_retrieved
|
| 899 |
+
|
| 900 |
+
# Prompt length statistics
|
| 901 |
+
prompt_lengths = [r.get('prompt_length_tokens_est', 0) for r in results if r.get('status') == 'success']
|
| 902 |
+
avg_prompt_tokens = sum(prompt_lengths) / len(prompt_lengths) if prompt_lengths else 0
|
| 903 |
+
max_prompt_tokens = max(prompt_lengths) if prompt_lengths else 0
|
| 904 |
+
|
| 905 |
+
print()
|
| 906 |
+
print("Memory Statistics:")
|
| 907 |
+
print(f" Total retrievals: {total_original}")
|
| 908 |
+
print(f" After filtering: {total_retrieved}")
|
| 909 |
+
print(f" Filtered out: {filtered} (perfect matches)")
|
| 910 |
+
print(f" Avg per problem: {total_retrieved / len(results):.2f}")
|
| 911 |
+
print()
|
| 912 |
+
print("Prompt Length Statistics:")
|
| 913 |
+
print(f" Avg prompt tokens: {avg_prompt_tokens:.0f}")
|
| 914 |
+
print(f" Max prompt tokens: {max_prompt_tokens:.0f}")
|
| 915 |
+
print(f" ℹ️ All prompts saved in 'full_input_prompt' field")
|
| 916 |
+
print("="*80)
|
| 917 |
+
|
| 918 |
+
|
| 919 |
+
if __name__ == "__main__":
|
| 920 |
+
main()
|
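Usage sketch: a minimal driver for calling `generate_with_memory` directly, mirroring what `main()` wires up. The import path (assuming `src/` is on `sys.path`), the memory directory, and the problem text are placeholders, not values shipped in this commit.

```python
# Hypothetical driver; adjust the import path to how src/ is installed.
from debate_memory.memory_bank import MemoryBank
from debate_memory.generate_with_memory import generate_with_memory

bank = MemoryBank(memory_dir="memory_storage")  # default layout assumed

result = generate_with_memory(
    problem_id=1,
    problem_desc="A factory produces two products ...",  # placeholder description
    memory_bank=bank,
    model="gpt-4o",
    temperature=0.01,
    top_k=4,
    auto_debug=True,   # execute the generated code and self-heal on failure
    max_retries=3,     # retry budget for the self-healing loop
)
print(result["status"], result["total_attempts"])
```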
src/debate_memory/llm.py
ADDED
@@ -0,0 +1,111 @@
"""
Lightweight HTTP client for OpenAI-compatible chat completions.

- Credentials are read from environment variables only.
- Supported environment variables:
  * `LLM_API_BASE_URL`
  * `LLM_API_KEY`
  * `OPENAI_BASE_URL`
  * `OPENAI_API_KEY`
  * `API_URL`
  * `API_KEY`
"""

from __future__ import annotations

import json
import os
import time
from typing import Dict, List

import requests


def _get_credentials() -> Dict[str, str]:
    api_key = (
        os.getenv("LLM_API_KEY")
        or os.getenv("OPENAI_API_KEY")
        or os.getenv("API_KEY")
    )
    base_url = (
        os.getenv("LLM_API_BASE_URL")
        or os.getenv("OPENAI_BASE_URL")
        or os.getenv("API_URL")
    )
    if not api_key:
        raise RuntimeError(
            "Missing API key. Set one of: LLM_API_KEY, OPENAI_API_KEY, API_KEY."
        )
    if not base_url:
        raise RuntimeError(
            "Missing API base URL. Set one of: "
            "LLM_API_BASE_URL, OPENAI_BASE_URL, API_URL."
        )
    return {"api_key": api_key, "base_url": base_url.rstrip("/")}


def _post_chat_completion(
    messages: List[Dict[str, str]],
    model: str,
    temperature: float,
    max_tokens: int,
) -> Dict:
    creds = _get_credentials()
    url = f"{creds['base_url']}/chat/completions"
    headers = {
        "Authorization": f"Bearer {creds['api_key']}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    try:
        return response.json()
    except json.JSONDecodeError as exc:  # pragma: no cover - defensive
        raise RuntimeError(f"Non-JSON response from LLM API: {response.text[:200]}") from exc


def _extract_content(result: Dict) -> str:
    choices = result.get("choices")
    if not choices:
        raise RuntimeError(f"LLM API response missing 'choices': {result}")
    message = choices[0].get("message") or {}
    content = message.get("content")
    if content is None:
        raise RuntimeError(f"LLM API response missing message content: {result}")
    return content


def get_response(prompt: str, model: str, temperature: float = 0.01, maximum_retries: int = 10) -> str:
    """
    Send a chat completion request using OpenAI-compatible REST calls.
    """
    if model.startswith("deepseek"):
        real_model = model.replace("-chat", "-v3").replace("-reasoner", "-r1")
    else:
        real_model = model

    attempts = max(1, maximum_retries)
    last_error: Exception | None = None
    while attempts > 0:
        try:
            result = _post_chat_completion(
                messages=[{"role": "user", "content": prompt}],
                model=real_model,
                temperature=temperature,
                max_tokens=16384,
            )
            return _extract_content(result)
        except Exception as exc:  # noqa: BLE001
            last_error = exc
            attempts -= 1
            if attempts == 0:
                break
            print(f"Error using API: {exc}. Retrying...")
            time.sleep(2)

    raise RuntimeError(f"Failed to get response from API after retries: {last_error}")
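Usage sketch: pointing the client at an OpenAI-compatible endpoint before calling `get_response`. The URL and key below are placeholders; the import path assumes `src/` is importable.

```python
# Hypothetical configuration; never commit real keys.
import os

os.environ.setdefault("LLM_API_BASE_URL", "https://api.example.com/v1")  # placeholder endpoint
os.environ.setdefault("LLM_API_KEY", "sk-...")                           # placeholder key

from debate_memory.llm import get_response

print(get_response("Reply with the single word: ok", model="gpt-4o", temperature=0.0))
```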
src/debate_memory/memory_bank.py
ADDED
@@ -0,0 +1,316 @@
"""
Memory Bank for storing and retrieving successful problem-solving cases
Uses LlamaIndex for RAG-based case retrieval
"""

import os
import json
from pathlib import Path
from typing import List, Dict, Optional
from llama_index.core import Document, VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

_PKG_DIR = Path(__file__).resolve().parent
_PROJECT_ROOT = _PKG_DIR.parent.parent
DEFAULT_MEMORY_DIR = str(_PROJECT_ROOT / "memory_storage")


class MemoryBank:
    """
    Memory Bank for storing successful problem-solving experiences

    Design inspired by Memento (https://arxiv.org/pdf/2508.16153):
    - Episodic memory: Store past successful trajectories
    - Case-based reasoning: Retrieve similar cases to guide the current problem
    - Non-parametric: No gradient updates, just memory read/write
    """

    def __init__(self, memory_dir: str = DEFAULT_MEMORY_DIR, embedding_model: str = "BAAI/bge-small-en-v1.5"):
        """
        Initialize Memory Bank

        Args:
            memory_dir: Directory to store memory index and cases
            embedding_model: HuggingFace embedding model name or local path
        """
        self.memory_dir = memory_dir
        os.makedirs(memory_dir, exist_ok=True)

        self.cases_file = os.path.join(memory_dir, "cases.jsonl")
        self.index_dir = os.path.join(memory_dir, "index")

        # Configure embedding model with local caching:
        # - cache_folder points at llama_index's cache directory
        # - trust_remote_code is False for security
        # - if embedding_model is a local path, use it directly;
        #   otherwise try the cached model to avoid network requests
        os.environ.setdefault("HF_HUB_OFFLINE", "0")  # Allow online access by default

        # Check whether embedding_model is a local file path
        is_local_path = os.path.isabs(embedding_model) or (os.path.sep in embedding_model and os.path.exists(embedding_model))

        try:
            # If it's a local path, use it directly
            if is_local_path:
                print(f"📁 Using local embedding model from: {embedding_model}")
                Settings.embed_model = HuggingFaceEmbedding(
                    model_name=embedding_model,
                    cache_folder=os.path.expanduser("~/.cache/llama_index"),
                    trust_remote_code=False
                )
            else:
                # Try to load from cache first to avoid network requests;
                # set HF_HUB_OFFLINE=1 to force local-only mode
                print(f"🔍 Loading embedding model: {embedding_model}")
                print("   (If you want to avoid Hugging Face downloads, set HF_HUB_OFFLINE=1 or use a local model path)")
                Settings.embed_model = HuggingFaceEmbedding(
                    model_name=embedding_model,
                    cache_folder=os.path.expanduser("~/.cache/llama_index"),
                    trust_remote_code=False
                )
        except Exception as e:
            # If model loading fails, try to use the cached model only
            print(f"⚠️ Warning: Failed to load embedding model '{embedding_model}': {e}")
            print("   Attempting to use cached model only (setting HF_HUB_OFFLINE=1)...")
            os.environ["HF_HUB_OFFLINE"] = "1"
            try:
                Settings.embed_model = HuggingFaceEmbedding(
                    model_name=embedding_model,
                    cache_folder=os.path.expanduser("~/.cache/llama_index"),
                    trust_remote_code=False
                )
                print("   ✅ Using cached model")
            except Exception as e2:
                print(f"❌ Error: Could not load embedding model: {e2}")
                print("   Please either:")
                print("   1. Download the model first: python -c \"from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-small-en-v1.5')\"")
                print("   2. Set HF_HUB_OFFLINE=1 and ensure the model is cached")
                print("   3. Use a local model path: --embedding_model /path/to/local/model")
                raise
        # Disable chunking to ensure one document = one node (no duplicates)
        Settings.chunk_size = 8192  # Large enough to never split
        Settings.chunk_overlap = 0

        # Load or create index
        self.index = self._load_or_create_index()
        self.case_count = self._count_cases()

        print(f"Memory Bank initialized with {self.case_count} cases")

    def _load_or_create_index(self):
        """Load the existing index or create a new one"""
        if os.path.exists(self.index_dir):
            try:
                storage_context = StorageContext.from_defaults(persist_dir=self.index_dir)
                index = load_index_from_storage(storage_context)
                print(f"Loaded existing memory index from {self.index_dir}")
                return index
            except Exception:  # avoid a bare except; fall through to rebuilding
                print("Failed to load index, creating new one")

        # Create a new empty index
        documents = []
        index = VectorStoreIndex.from_documents(documents)
        os.makedirs(self.index_dir, exist_ok=True)
        index.storage_context.persist(persist_dir=self.index_dir)
        print(f"Created new memory index at {self.index_dir}")
        return index

    def _count_cases(self) -> int:
        """Count the number of cases in memory"""
        if not os.path.exists(self.cases_file):
            return 0
        with open(self.cases_file, 'r') as f:
            return sum(1 for _ in f)

    def add_case(self, problem_id: int, problem_desc: str, solution_code: str,
                 objective_value: float, is_correct: bool, metadata: Optional[Dict] = None):
        """
        Add a successful case to memory

        Args:
            problem_id: Problem ID
            problem_desc: Problem description
            solution_code: Solution code
            objective_value: Computed objective value
            is_correct: Whether the solution is correct
            metadata: Additional metadata (model, debate_rounds, etc.)
        """
        if not is_correct:
            # Only store successful cases
            return

        case = {
            'problem_id': problem_id,
            'description': problem_desc,
            'solution_code': solution_code,
            'objective_value': objective_value,
            'is_correct': is_correct,
            'metadata': metadata or {}
        }

        # Write to the cases file
        with open(self.cases_file, 'a', encoding='utf-8') as f:
            f.write(json.dumps(case, ensure_ascii=False) + '\n')

        # Create a document for indexing;
        # combine description and key solution insights for better retrieval
        doc_text = f"""Problem: {problem_desc}

Solution approach:
{solution_code[:500]}...

Key features:
- Problem ID: {problem_id}
- Objective value: {objective_value}
- Status: Correct
"""

        doc = Document(
            text=doc_text,
            metadata={
                'problem_id': problem_id,
                'objective_value': objective_value,
                **case['metadata']
            }
        )

        # Add to the index
        self.index.insert(doc)
        self.index.storage_context.persist(persist_dir=self.index_dir)

        self.case_count += 1
        print(f"✅ Added case {problem_id} to memory (Total: {self.case_count})")

    def retrieve_similar_cases(self, query: str, top_k: int = 3, preferred_dataset: Optional[str] = None) -> List[Dict]:
        """
        Retrieve similar cases from memory using RAG based on semantic similarity

        Args:
            query: Query text (usually the problem description)
            top_k: Number of similar cases to retrieve (0 = no retrieval)
            preferred_dataset: Preferred dataset name to prioritize (optional)

        Returns:
            List of similar cases with scores, sorted by semantic similarity
        """
        if self.case_count == 0 or top_k <= 0:
            return []

        # Query the index - purely based on semantic similarity
        retriever = self.index.as_retriever(similarity_top_k=top_k * 2 if preferred_dataset else top_k)
        nodes = retriever.retrieve(query)

        # Load the corresponding cases from cases.jsonl based on semantic similarity
        similar_cases = []
        seen_keys = set()  # Track which (problem_id, dataset) combinations we've added

        # If preferred_dataset is specified, prioritize those cases
        preferred_cases = []
        other_cases = []

        for node in nodes:
            problem_id = node.metadata.get('problem_id')
            score = node.score
            node_dataset = node.metadata.get('dataset', '')

            # Build a key for deduplication
            case_key = (problem_id, node_dataset)
            if case_key in seen_keys:
                continue

            # Load the case - use the dataset from node metadata to get the exact match
            case_data = None
            if node_dataset:
                # Try to load by problem_id and dataset (more precise)
                case_data = self._load_case_by_id_and_dataset(problem_id, node_dataset)

            if not case_data:
                # Fallback: try to load by problem_id only
                case_data = self._load_case_by_id(problem_id)

            if case_data:
                seen_keys.add(case_key)
                case_item = {
                    'case': case_data,
                    'score': score,
                    'text_preview': node.text[:200]
                }

                # Separate preferred-dataset cases from the others
                if preferred_dataset and node_dataset == preferred_dataset:
                    preferred_cases.append(case_item)
                else:
                    other_cases.append(case_item)

        # Combine: preferred cases first, then others, all sorted by similarity score
        similar_cases = preferred_cases + other_cases

        # Return the top_k results
        return similar_cases[:top_k]

    def _load_case_by_id(self, problem_id: int) -> Optional[Dict]:
        """Load a specific case by problem ID (returns the first match)"""
        if not os.path.exists(self.cases_file):
            return None

        with open(self.cases_file, 'r', encoding='utf-8') as f:
            for line in f:
                case = json.loads(line)
                if case['problem_id'] == problem_id:
                    return case
        return None

    def _load_case_by_id_and_dataset(self, problem_id: int, dataset: str) -> Optional[Dict]:
        """Load a specific case by problem ID and dataset"""
        if not os.path.exists(self.cases_file):
            return None

        with open(self.cases_file, 'r', encoding='utf-8') as f:
            for line in f:
                case = json.loads(line)
                if case['problem_id'] == problem_id:
                    case_dataset = case.get('metadata', {}).get('dataset', '')
                    if case_dataset == dataset:
                        return case
        return None

    def get_memory_stats(self) -> Dict:
        """Get memory bank statistics"""
        return {
            'total_cases': self.case_count,
            'memory_dir': self.memory_dir,
            'cases_file': self.cases_file,
            'index_dir': self.index_dir
        }

    def format_retrieved_cases_for_prompt(self, cases: List[Dict]) -> str:
        """
        Format retrieved cases for inclusion in an LLM prompt

        Args:
            cases: List of retrieved cases

        Returns:
            Formatted string for the prompt
        """
        if not cases:
            return ""

        prompt = "# Retrieved Similar Cases from Memory\n\n"
        prompt += "The following successful cases from previous problems might be relevant:\n\n"

        for i, item in enumerate(cases, 1):
            case = item['case']
            score = item['score']

            prompt += f"## Case {i} (Similarity: {score:.3f})\n"
            prompt += f"**Problem:** {case['description']}\n\n"
            prompt += f"**Solution approach:**\n```python\n{case['solution_code']}\n```\n\n"
            prompt += f"**Result:** Objective value = {case['objective_value']}, Status = Correct\n\n"
            prompt += "---\n\n"

        return prompt
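Usage sketch: the MemoryBank write/read cycle under the default layout. The problem text, solution code, and dataset tag are placeholders; the import path assumes `src/` is importable.

```python
# Hypothetical round-trip through the memory bank.
from debate_memory.memory_bank import MemoryBank

bank = MemoryBank(memory_dir="memory_storage")

# Write: only correct solutions are persisted (is_correct gates the insert).
bank.add_case(
    problem_id=42,
    problem_desc="Maximize profit subject to capacity limits ...",  # placeholder
    solution_code="import gurobipy as gp\n...",                     # placeholder
    objective_value=1234.0,
    is_correct=True,
    metadata={"dataset": "IndustryOR"},
)

# Read: returns [{'case': ..., 'score': ..., 'text_preview': ...}, ...]
hits = bank.retrieve_similar_cases("capacity-constrained production planning", top_k=3)
print(bank.format_retrieved_cases_for_prompt(hits))
```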
src/debate_memory/memory_intelligence.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Lightweight helpers for categorising optimisation problems and surfacing
|
| 4 |
+
category-level memory.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import re
|
| 12 |
+
from collections import defaultdict
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Dict, Iterable, List, Set, Tuple
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
_PKG_DIR = Path(__file__).resolve().parent
|
| 18 |
+
_PROJECT_ROOT = _PKG_DIR.parent.parent
|
| 19 |
+
DEFAULT_GUIDELINE_PATH = str(_PROJECT_ROOT / "memory_storage" / "category_guidelines.jsonl")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class MemoryIntelligence:
|
| 23 |
+
"""
|
| 24 |
+
Heuristic problem classifier + guideline loader.
|
| 25 |
+
|
| 26 |
+
The goal is to offer fast, rule-based categorisation that can run
|
| 27 |
+
offline. If the heuristics fail, downstream agents (LLMs) can still
|
| 28 |
+
append tags, but we always return the heuristic view for consistency.
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
CATEGORY_KEYWORDS: Dict[str, Set[str]] = {
|
| 32 |
+
"workforce_planning": {
|
| 33 |
+
"worker",
|
| 34 |
+
"workforce",
|
| 35 |
+
"training",
|
| 36 |
+
"trainee",
|
| 37 |
+
"overtime",
|
| 38 |
+
"hire",
|
| 39 |
+
"fire",
|
| 40 |
+
},
|
| 41 |
+
"inventory_planning": {
|
| 42 |
+
"inventory",
|
| 43 |
+
"backlog",
|
| 44 |
+
"stock",
|
| 45 |
+
"warehouse",
|
| 46 |
+
"storage",
|
| 47 |
+
"holding cost",
|
| 48 |
+
},
|
| 49 |
+
"production_planning": {
|
| 50 |
+
"production",
|
| 51 |
+
"factory",
|
| 52 |
+
"capacity",
|
| 53 |
+
"machine",
|
| 54 |
+
"batch",
|
| 55 |
+
"demand",
|
| 56 |
+
},
|
| 57 |
+
"scheduling": {
|
| 58 |
+
"schedule",
|
| 59 |
+
"sequencing",
|
| 60 |
+
"precedence",
|
| 61 |
+
"flow shop",
|
| 62 |
+
"job shop",
|
| 63 |
+
"makespan",
|
| 64 |
+
},
|
| 65 |
+
"transportation": {
|
| 66 |
+
"transport",
|
| 67 |
+
"shipping",
|
| 68 |
+
"vehicle",
|
| 69 |
+
"route",
|
| 70 |
+
"delivery",
|
| 71 |
+
"supply",
|
| 72 |
+
"demand",
|
| 73 |
+
"shipment",
|
| 74 |
+
},
|
| 75 |
+
"network_flow": {
|
| 76 |
+
"flow",
|
| 77 |
+
"arc",
|
| 78 |
+
"network",
|
| 79 |
+
"node",
|
| 80 |
+
"capacity",
|
| 81 |
+
"supply node",
|
| 82 |
+
"demand node",
|
| 83 |
+
},
|
| 84 |
+
"assignment": {
|
| 85 |
+
"assignment",
|
| 86 |
+
"allocate",
|
| 87 |
+
"task",
|
| 88 |
+
"agent",
|
| 89 |
+
"matching",
|
| 90 |
+
"job",
|
| 91 |
+
},
|
| 92 |
+
"facility_location": {
|
| 93 |
+
"facility",
|
| 94 |
+
"location",
|
| 95 |
+
"plant",
|
| 96 |
+
"open",
|
| 97 |
+
"siting",
|
| 98 |
+
"distribution center",
|
| 99 |
+
},
|
| 100 |
+
"traveling_salesman": {
|
| 101 |
+
"tsp",
|
| 102 |
+
"tour",
|
| 103 |
+
"city",
|
| 104 |
+
"travel",
|
| 105 |
+
"route visiting",
|
| 106 |
+
"cyclic",
|
| 107 |
+
},
|
| 108 |
+
"portfolio_optimization": {
|
| 109 |
+
"portfolio",
|
| 110 |
+
"investment",
|
| 111 |
+
"asset",
|
| 112 |
+
"return",
|
| 113 |
+
"risk",
|
| 114 |
+
"variance",
|
| 115 |
+
},
|
| 116 |
+
}
|
| 117 |
+
|
    def __init__(self, guideline_path: str = DEFAULT_GUIDELINE_PATH):
        self.guideline_path = guideline_path
        self.guidelines = self._load_guidelines(guideline_path)

    @staticmethod
    def _load_guidelines(path: str) -> Dict[str, Dict]:
        guidelines: Dict[str, Dict] = {}
        if not path or not os.path.exists(path):
            return guidelines
        with open(path, "r", encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    payload = json.loads(line)
                except json.JSONDecodeError:
                    continue
                category = payload.get("category")
                if not category:
                    continue
                guidelines[category] = payload
        return guidelines

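    # Guideline file format consumed by _load_guidelines above: one JSON object
    # per line, keyed by "category". A hypothetical entry:
    #   {"category": "scheduling", "title": "Scheduling",
    #    "guidelines": ["Model makespan as an explicit decision variable"]}
    # Blank lines, malformed JSON, and entries without "category" are skipped.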
    def classify(self, description: str, top_k: int = 3, minimum_score: int = 1) -> List[Tuple[str, int]]:
        """
        Return a ranked list of (category, score) using keyword heuristics.
        """
        if not description:
            return []
        text = description.lower()
        scores: Dict[str, int] = defaultdict(int)
        for category, keywords in self.CATEGORY_KEYWORDS.items():
            for keyword in keywords:
                occurrences = len(re.findall(r"\b" + re.escape(keyword.lower()) + r"\b", text))
                if occurrences:
                    scores[category] += occurrences
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        filtered = [(cat, score) for cat, score in ranked if score >= minimum_score]
        if top_k:
            return filtered[:top_k]
        return filtered

    def categories_only(self, description: str, top_k: int = 3, minimum_score: int = 1) -> List[str]:
        return [cat for cat, _ in self.classify(description, top_k=top_k, minimum_score=minimum_score)]

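    # A minimal usage sketch of classify() above (the sample text is made up):
    #   MemoryIntelligence().classify("Schedule the jobs to minimize makespan")
    #   -> [("scheduling", 2)]
    # "schedule" and "makespan" each score one hit; "jobs" does not count
    # because the \b-anchored regex only matches the whole token "job".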
    def guideline_text(
        self,
        categories: Iterable[str],
        include_header: bool = True,
        max_items_per_category: int = 4,
    ) -> str:
        """
        Render guidelines for the provided categories as a markdown string.
        """
        categories = list(dict.fromkeys(categories))  # deduplicate while preserving order
        if not categories:
            return ""

        lines: List[str] = []
        if include_header:
            lines.append("# Category Playbook")
            lines.append("")

        for category in categories:
            entry = self.guidelines.get(category)
            if not entry:
                continue
            title = entry.get("title") or category.replace("_", " ").title()
            lines.append(f"## {title}")
            guidelines = entry.get("guidelines") or []
            if not guidelines:
                continue
            for bullet in guidelines[:max_items_per_category]:
                lines.append(f"- {bullet}")
            lines.append("")

        return "\n".join(lines).strip()

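    # guideline_text(["scheduling"]) above would render, given the hypothetical
    # JSONL entry sketched earlier:
    #   # Category Playbook
    #
    #   ## Scheduling
    #   - Model makespan as an explicit decision variable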
    def guideline_bullets(self, categories: Iterable[str], max_items_per_category: int = 4) -> List[str]:
        bullets: List[str] = []
        for category in categories:
            entry = self.guidelines.get(category)
            if not entry:
                continue
            title = entry.get("title") or category.replace("_", " ").title()
            guidelines = entry.get("guidelines") or []
            for item in guidelines[:max_items_per_category]:
                bullets.append(f"{title}: {item}")
        return bullets


__all__ = ["MemoryIntelligence", "DEFAULT_GUIDELINE_PATH"]

src/debate_memory/run_memory_debate.py
ADDED
@@ -0,0 +1,580 @@
#!/usr/bin/env python3
"""
Run debates between two models using memory-augmented single generations.

This script automatically locates the latest initial-solution files for the
specified models, runs the parallel debate workflow from `simple_rag/debate.py`,
and then evaluates the consensus solutions with `execute.py`.

Example:
    python run_memory_debate.py \
        --datasets ComplexLP EasyLP \
        --max_rounds 3 \
        --debate_workers 16 \
        --execute_workers 128
"""
from __future__ import annotations

import argparse
import datetime as dt
import glob
import json
import os
import re
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from .config import normalize_dataset_name
from .memory_bank import MemoryBank

SCRIPT_DIR = Path(__file__).resolve().parent
SRC_DIR = SCRIPT_DIR.parent
PROJECT_ROOT = SRC_DIR.parent
MONOREPO_ROOT = PROJECT_ROOT.parent
STANDARD_RESULTS_ROOT = PROJECT_ROOT.parent.parent / "results" / "Agora-Opt"
DEFAULT_RESULTS_DIR = STANDARD_RESULTS_ROOT / "generation"
DEFAULT_OUTPUT_ROOT = STANDARD_RESULTS_ROOT / "debate"
DEFAULT_DEBATE_SCRIPT = MONOREPO_ROOT / "simple_rag" / "debate.py"
DEFAULT_EXECUTE_SCRIPT = PROJECT_ROOT / "scripts" / "execute.py"
DEFAULT_DEBATE_MEMORY_DIR = PROJECT_ROOT / "debate_memory_storage"
DEBATE_MEMORY_HEADER = "# Debate Memory Insights"


def format_debate_memory_context(cases: List[Dict]) -> str:
    if not cases:
        return ""
    lines = [DEBATE_MEMORY_HEADER, ""]
    for idx, item in enumerate(cases, 1):
        case = item["case"]
        score = item.get("score", 0.0)
        metadata = case.get("metadata", {})
        dataset = metadata.get("dataset", "unknown")
        summary = metadata.get("summary", {}).get("summary")
        lines.append(f"## Case {idx} (similarity {score:.3f}, dataset {dataset})")
        description = case.get("description", "").strip()
        if description:
            snippet = description if len(description) <= 800 else description[:800] + "\n..."
            lines.append(snippet)
        if summary:
            lines.append("Summary: " + summary)
        lines.append("---")
    return "\n".join(lines).strip()


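# The context block produced by format_debate_memory_context above looks like
# (all values illustrative):
#   # Debate Memory Insights
#
#   ## Case 1 (similarity 0.873, dataset ComplexLP)
#   <first 800 characters of the retrieved problem description>
#   Summary: <stored summary, when present>
#   ---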
def build_debate_memory_contexts(
    files: List[str],
    debate_memory: Optional[MemoryBank],
    dataset: str,
    top_k: int,
) -> Dict[int, str]:
    contexts: Dict[int, str] = {}
    if debate_memory is None or top_k <= 0:
        return contexts
    for file_path in files:
        with open(file_path, "r", encoding="utf-8") as fh:
            for line in fh:
                if not line.strip():
                    continue
                data = json.loads(line)
                problem_id = data.get("id")
                if problem_id is None or problem_id in contexts:
                    continue
                description = data.get("description", "")
                if not description.strip():
                    contexts[problem_id] = ""
                    continue
                cases = debate_memory.retrieve_similar_cases(
                    description,
                    top_k=top_k,
                    preferred_dataset=dataset,
                )
                contexts[problem_id] = format_debate_memory_context(cases)
    return contexts


def maybe_enrich_generation_file(
    source_path: str,
    destination_path: str,
    contexts: Dict[int, str],
) -> str:
    if not contexts:
        return source_path
    changed = False
    enriched_lines: List[str] = []
    with open(source_path, "r", encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            data = json.loads(line)
            pid = data.get("id")
            context = contexts.get(pid)
            if context:
                data["description"] = f"{data.get('description', '').strip()}\n\n{context}"
                changed = True
            enriched_lines.append(json.dumps(data, ensure_ascii=False))
    if not changed:
        return source_path
    with open(destination_path, "w", encoding="utf-8") as fh:
        for entry in enriched_lines:
            fh.write(entry + "\n")
    return destination_path


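# Effect of maybe_enrich_generation_file on one JSONL record (values made up):
#   before: {"id": 7, "description": "Minimize total shipping cost ..."}
#   after:  {"id": 7, "description": "Minimize total shipping cost ...\n\n# Debate Memory Insights\n..."}
# When no record changes, the original source_path is returned and nothing is written.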
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Parallel debate runner for memory-enhanced single generations"
    )
    parser.add_argument(
        "--modelA",
        type=str,
        default="gpt-4o",
        help="First model in the debate (default: gpt-4o)",
    )
    parser.add_argument(
        "--modelB",
        type=str,
        default="deepseek-chat",
        help="Second model in the debate (default: deepseek-chat)",
    )
    parser.add_argument(
        "--results_dir",
        type=str,
        default=str(DEFAULT_RESULTS_DIR),
        help="Directory that stores initial-solution JSONL files",
    )
    parser.add_argument(
        "--datasets",
        nargs="*",
        default=None,
        help="Datasets to debate. If omitted, auto-detect common datasets.",
    )
    parser.add_argument(
        "--output_root",
        type=str,
        default=str(DEFAULT_OUTPUT_ROOT),
        help="Root directory to store debate/eval outputs",
    )
    parser.add_argument(
        "--debate_script",
        type=str,
        default=str(DEFAULT_DEBATE_SCRIPT),
        help="Path to simple_rag/debate.py (override if needed)",
    )
    parser.add_argument(
        "--execute_script",
        type=str,
        default=str(DEFAULT_EXECUTE_SCRIPT),
        help="Path to debate_with_memory/execute.py (override if needed)",
    )
    parser.add_argument(
        "--max_rounds",
        type=int,
        default=3,
        help="Maximum number of debate rounds (default: 3)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.01,
        help="Temperature for debate LLM calls (default: 0.01)",
    )
    parser.add_argument(
        "--debate_workers",
        type=int,
        default=16,
        help="Parallel workers for debate (ThreadPool inside debate.py)",
    )
    parser.add_argument(
        "--execute_workers",
        type=int,
        default=128,
        help="Parallel workers for execute.py evaluation",
    )
    parser.add_argument(
        "--max_problems",
        type=int,
        default=None,
        help="Optional cap on number of problems per dataset",
    )
    parser.add_argument(
        "--tolerance",
        type=float,
        default=0.05,
        help="Relative tolerance for evaluation accuracy comparison",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=90,
        help="Timeout (seconds) for executing consensus code",
    )
    parser.add_argument(
        "--relative_tolerance",
        action="store_true",
        help="Pass --use_relative_tolerance to execute.py",
    )
    parser.add_argument(
        "--save_execution_stdout",
        action="store_true",
        help="Store stdout/stderr for consensus executions",
    )
    parser.add_argument(
        "--execute_memory_dir",
        type=str,
        default=None,
        help="Optional memory_storage directory forwarded to execute.py during consensus evaluation.",
    )
    parser.add_argument(
        "--execute_debug_memory_path",
        type=str,
        default=None,
        help="Optional debug_memory.jsonl path forwarded to execute.py during consensus evaluation.",
    )
    parser.add_argument(
        "--execute_disable_debug_memory",
        action="store_true",
        help="Pass --disable_debug_memory to execute.py during consensus evaluation.",
    )
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Only print the planned actions without running debate/eval",
    )
    parser.add_argument(
        "--debate_memory_dir",
        type=str,
        default=str(DEFAULT_DEBATE_MEMORY_DIR),
        help="Directory containing debate memory cases for prompt augmentation",
    )
    parser.add_argument(
        "--debate_memory_top_k",
        type=int,
        default=2,
        help="How many debate memory cases to retrieve per problem",
    )
    parser.add_argument(
        "--disable_debate_memory",
        action="store_true",
        help="Skip retrieval even if debate memory directory exists",
    )
    parser.add_argument(
        "--embedding_model",
        type=str,
        default=None,
        help="Embedding model name or local path (default: BAAI/bge-small-en-v1.5). "
        "Use a local path to avoid Hugging Face downloads, or set the HF_HUB_OFFLINE=1 environment variable.",
    )
    return parser.parse_args()


def normalize_dataset_list(raw_list: Optional[List[str]]) -> Optional[List[str]]:
    """Split comma-separated values and strip whitespace."""
    if not raw_list:
        return None
    datasets: List[str] = []
    for item in raw_list:
        parts = [part.strip() for part in item.split(",") if part.strip()]
        datasets.extend(normalize_dataset_name(part) for part in parts)
    return datasets or None


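# Example (assuming normalize_dataset_name leaves these names unchanged):
#   normalize_dataset_list(["ComplexLP, EasyLP"]) -> ["ComplexLP", "EasyLP"]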
def collect_runs(results_dir: str, model: str) -> Dict[str, List[Tuple[str, str]]]:
    """
    Return mapping dataset -> list of (timestamp, path) sorted ascending.
    Skips evaluation artifacts (suffixes containing '_eval').
    """
    pattern = os.path.join(results_dir, f"{model}_*.jsonl")
    regex = re.compile(rf"{re.escape(model)}_(.+)_(\d{{8}}_\d{{6}})\.jsonl$")
    runs: Dict[str, List[Tuple[str, str]]] = {}

    for path in glob.glob(pattern):
        base = os.path.basename(path)
        match = regex.match(base)
        if not match:
            continue
        dataset = normalize_dataset_name(match.group(1))
        if "_eval" in dataset:
            continue
        timestamp = match.group(2)
        runs.setdefault(dataset, []).append((timestamp, path))

    for dataset in runs:
        runs[dataset].sort()  # chronological

    return runs


def pick_latest(runs: Dict[str, List[Tuple[str, str]]], dataset: str) -> Optional[str]:
    """Return latest file path for dataset if available."""
    entries = runs.get(dataset)
    if not entries:
        return None
    return entries[-1][1]


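# Filename convention expected by collect_runs (example name is illustrative):
#   gpt-4o_ComplexLP_20240101_120000.jsonl
#   -> dataset "ComplexLP" (after normalize_dataset_name), timestamp "20240101_120000"
# Lexicographic sort of zero-padded YYYYMMDD_HHMMSS timestamps is chronological,
# which is why pick_latest can simply take entries[-1].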
def stream_command(cmd: List[str], cwd: str, log_path: str) -> None:
    """Run a subprocess, streaming output to stdout and a log file."""
    print(f"\n▶ Running: {' '.join(cmd)}", flush=True)
    print(f"  cwd: {cwd}", flush=True)
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    with open(log_path, "w", encoding="utf-8") as log_file:
        process = subprocess.Popen(
            cmd,
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            encoding="utf-8",
            errors="replace",
            bufsize=1,
        )
        assert process.stdout is not None  # for type checkers
        for line in process.stdout:
            print(line, end="", flush=True)
            log_file.write(line)
            log_file.flush()
        return_code = process.wait()

    if return_code != 0:
        raise subprocess.CalledProcessError(return_code, cmd)


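# In stream_command above, bufsize=1 requests line buffering (valid because
# text=True), so each line the child writes is echoed to the console and the
# log without waiting for the process to finish.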
def load_eval_report(report_path: str) -> Optional[Dict]:
    if not os.path.exists(report_path):
        return None
    with open(report_path, "r", encoding="utf-8") as fh:
        return json.load(fh)


def ensure_script(path: str, description: str) -> None:
    if not os.path.isfile(path):
        raise FileNotFoundError(f"{description} not found: {path}")


def main() -> None:
    args = parse_args()
    args.datasets = normalize_dataset_list(args.datasets)
    args.output_root = os.path.abspath(args.output_root)
    args.results_dir = os.path.abspath(args.results_dir)

    debate_memory_bank: Optional[MemoryBank] = None
    if not args.disable_debate_memory and args.debate_memory_dir:
        debate_memory_path = Path(args.debate_memory_dir)
        if debate_memory_path.exists():
            try:
                embedding_model = args.embedding_model if args.embedding_model else "BAAI/bge-small-en-v1.5"
                debate_memory_bank = MemoryBank(
                    memory_dir=str(debate_memory_path),
                    embedding_model=embedding_model,
                )
            except Exception as exc:  # noqa: BLE001
                print(f"⚠️ Warning: failed to load debate memory from {debate_memory_path}: {exc}")
        else:
            print(f"ℹ️ Debate memory directory not found: {debate_memory_path} (skipping context retrieval)")

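    # MemoryBank presumably loads a sentence-embedding model at this point; on
    # machines without network access, pass --embedding_model with a local path
    # (or export HF_HUB_OFFLINE=1) so no Hugging Face download is attempted.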
    ensure_script(args.debate_script, "Debate script")
    ensure_script(args.execute_script, "Execute script")

    modelA_runs = collect_runs(args.results_dir, args.modelA)
    modelB_runs = collect_runs(args.results_dir, args.modelB)

    if args.datasets:
        datasets = args.datasets
    else:
        datasets = sorted(set(modelA_runs.keys()) & set(modelB_runs.keys()))

    if not datasets:
        print("❌ No common datasets with available runs were found.")
        sys.exit(1)

    print("=" * 80)
    print("🧠 Memory-Based Debate Runner")
    print("=" * 80)
    print(f"Model A: {args.modelA}")
    print(f"Model B: {args.modelB}")
    print(f"Datasets: {', '.join(datasets)}")
    print(f"Results dir: {args.results_dir}")
    print(f"Output root: {args.output_root}")
    print(f"Debate workers: {args.debate_workers} (parallel)")
    print("=" * 80)

    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    dataset_summaries: List[Dict] = []
    processed = 0

    for dataset in datasets:
        file_a = pick_latest(modelA_runs, dataset)
        file_b = pick_latest(modelB_runs, dataset)

        if not file_a or not file_b:
            print(f"⚠️ Skipping {dataset}: missing runs for one of the models.")
            dataset_summaries.append(
                {
                    "dataset": dataset,
                    "status": "missing_runs",
                    "modelA_file": file_a,
                    "modelB_file": file_b,
                }
            )
            continue

        run_dir = os.path.join(
            args.output_root,
            dataset,
            f"{timestamp}_{args.modelA}_vs_{args.modelB}",
        )
        os.makedirs(run_dir, exist_ok=True)

        print(f"\n{'=' * 80}")
        print(f"🚀 Dataset: {dataset}")
        print(f"   Model A file: {file_a}")
        print(f"   Model B file: {file_b}")
        print(f"   Output dir: {run_dir}")
        print(f"{'=' * 80}")

        file_a_for_debate = file_a
        file_b_for_debate = file_b
        if not args.dry_run and debate_memory_bank and args.debate_memory_top_k > 0:
            contexts = build_debate_memory_contexts(
                [file_a, file_b], debate_memory_bank, dataset, args.debate_memory_top_k
            )
            if any(contexts.values()):
                print("   🧠 Injecting debate memory context into prompts")
                enriched_a = os.path.join(
                    run_dir, f"{os.path.basename(file_a)}.debate_memory.jsonl"
                )
                enriched_b = os.path.join(
                    run_dir, f"{os.path.basename(file_b)}.debate_memory.jsonl"
                )
                file_a_for_debate = maybe_enrich_generation_file(file_a, enriched_a, contexts)
                file_b_for_debate = maybe_enrich_generation_file(file_b, enriched_b, contexts)

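        # Per-dataset layout produced below (file names follow from the code):
        #   <output_root>/<dataset>/<timestamp>_<modelA>_vs_<modelB>/
        #     <basename>.debate_memory.jsonl          (only when context was injected)
        #     debate.log
        #     consensus_<modelA>_vs_<modelB>.jsonl    (written by debate.py)
        #     eval_consensus/evaluation_report.json   (written by execute.py)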
        if args.dry_run:
            print("Dry-run mode → skipping actual execution.")
            dataset_summaries.append(
                {
                    "dataset": dataset,
                    "status": "dry_run",
                    "debate_dir": run_dir,
                    "modelA_file": file_a,
                    "modelB_file": file_b,
                }
            )
            continue

        # 1) Run debate
        debate_cmd = [
            sys.executable,
            "-u",
            args.debate_script,
            "--resultA",
            file_a_for_debate,
            "--resultB",
            file_b_for_debate,
            "--modelA",
            args.modelA,
            "--modelB",
            args.modelB,
            "--save_dir",
            run_dir,
            "--max_rounds",
            str(args.max_rounds),
            "--temperature",
            str(args.temperature),
            "--num_workers",
            str(args.debate_workers),
        ]
        if args.max_problems is not None:
            debate_cmd.extend(["--max_problems", str(args.max_problems)])

        debate_log = os.path.join(run_dir, "debate.log")
        stream_command(debate_cmd, cwd=str(MONOREPO_ROOT), log_path=debate_log)

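        # The debate command above expands to roughly (paths are placeholders):
        #   python -u simple_rag/debate.py --resultA <runA.jsonl> --resultB <runB.jsonl> \
        #     --modelA gpt-4o --modelB deepseek-chat --save_dir <run_dir> \
        #     --max_rounds 3 --temperature 0.01 --num_workers 16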
        consensus_file = os.path.join(
            run_dir, f"consensus_{args.modelA}_vs_{args.modelB}.jsonl"
        )
        if not os.path.exists(consensus_file):
            raise FileNotFoundError(
                f"Consensus file not found after debate: {consensus_file}"
            )

        # 2) Evaluate consensus
        eval_dir = os.path.join(run_dir, "eval_consensus")
        eval_cmd = [
            sys.executable,
            "-u",
            args.execute_script,
            "--input_file",
            consensus_file,
            "--output_dir",
            eval_dir,
            "--timeout",
            str(args.timeout),
            "--tolerance",
            str(args.tolerance),
            "--num_workers",
            str(args.execute_workers),
        ]
        if args.relative_tolerance:
            eval_cmd.append("--use_relative_tolerance")
        if args.save_execution_stdout:
            eval_cmd.append("--save_output")
        if args.execute_memory_dir:
            eval_cmd.extend(["--memory_dir", args.execute_memory_dir])
        if args.execute_debug_memory_path:
            eval_cmd.extend(["--debug_memory_path", args.execute_debug_memory_path])
        if args.execute_disable_debug_memory:
            eval_cmd.append("--disable_debug_memory")
        if args.embedding_model:
            eval_cmd.extend(["--embedding_model", args.embedding_model])

        eval_log = os.path.join(run_dir, "evaluate.log")
        stream_command(eval_cmd, cwd=str(PROJECT_ROOT), log_path=eval_log)

        report_path = os.path.join(eval_dir, "evaluation_report.json")
        report = load_eval_report(report_path)
        if report is None:
            raise FileNotFoundError(f"Missing evaluation report: {report_path}")

        dataset_summaries.append(
            {
                "dataset": dataset,
                "status": "completed",
                "debate_dir": run_dir,
                "accuracy": report.get("accuracy"),
                "correct": report.get("correct"),
                "total": report.get("total_problems"),
                "report_path": report_path,
            }
        )
        processed += 1

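    # The summary below reads "accuracy", "correct", and "total_problems" from
    # evaluation_report.json; "accuracy" in particular is assumed present,
    # since the "{accuracy:.2%}" format would raise on None.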
    print("\n" + "=" * 80)
    print("📊 Debate + Evaluation Summary")
    print("=" * 80)
    for item in dataset_summaries:
        dataset = item["dataset"]
        status = item["status"]
        if status == "completed":
            accuracy = item.get("accuracy")
            correct = item.get("correct")
            total = item.get("total")
            print(
                f"{dataset:25s} → accuracy {accuracy:.2%} ({correct}/{total}) | dir: {item['debate_dir']}"
            )
        elif status == "dry_run":
            print(f"{dataset:25s} → dry run (planned dir: {item['debate_dir']})")
        else:
            print(f"{dataset:25s} → {status} (A={item.get('modelA_file')}, B={item.get('modelB_file')})")

    print("=" * 80)
    if not args.dry_run and processed == 0:
        sys.exit("No datasets were processed successfully.")


if __name__ == "__main__":
    main()