SorrowTea committed on
Commit 96abbd8 · verified · 1 Parent(s): 336f1b6

Upload 45 files

Files changed (45)
  1. README.md +215 -3
  2. scripts/__pycache__/run_ablation_suite.cpython-311.pyc +0 -0
  3. scripts/augment_memory_from_standalone_runs.py +16 -0
  4. scripts/build_debate_memory.py +17 -0
  5. scripts/build_debug_memory.py +17 -0
  6. scripts/build_memory_assets.sh +56 -0
  7. scripts/build_memory_from_eval_results.py +17 -0
  8. scripts/execute.py +18 -0
  9. scripts/generate_with_memory.py +17 -0
  10. scripts/process_all_debate_cases.sh +64 -0
  11. scripts/run_ablation_suite.py +403 -0
  12. scripts/run_generate_and_evaluate.sh +640 -0
  13. scripts/run_memory_debate.py +17 -0
  14. scripts/test_self_healing_full.sh +92 -0
  15. src/debate_memory/__init__.py +11 -0
  16. src/debate_memory/__pycache__/__init__.cpython-310.pyc +0 -0
  17. src/debate_memory/__pycache__/__init__.cpython-311.pyc +0 -0
  18. src/debate_memory/__pycache__/build_memory_from_eval_results.cpython-311.pyc +0 -0
  19. src/debate_memory/__pycache__/config.cpython-310.pyc +0 -0
  20. src/debate_memory/__pycache__/config.cpython-311.pyc +0 -0
  21. src/debate_memory/__pycache__/debate_memory_builder.cpython-311.pyc +0 -0
  22. src/debate_memory/__pycache__/debug_executor.cpython-310.pyc +0 -0
  23. src/debate_memory/__pycache__/debug_memory.cpython-310.pyc +0 -0
  24. src/debate_memory/__pycache__/debug_memory_builder.cpython-311.pyc +0 -0
  25. src/debate_memory/__pycache__/generate_with_memory.cpython-310.pyc +0 -0
  26. src/debate_memory/__pycache__/generate_with_memory.cpython-311.pyc +0 -0
  27. src/debate_memory/__pycache__/llm.cpython-310.pyc +0 -0
  28. src/debate_memory/__pycache__/llm.cpython-311.pyc +0 -0
  29. src/debate_memory/__pycache__/memory_bank.cpython-310.pyc +0 -0
  30. src/debate_memory/__pycache__/memory_bank.cpython-311.pyc +0 -0
  31. src/debate_memory/__pycache__/run_memory_debate.cpython-311.pyc +0 -0
  32. src/debate_memory/augment_memory_from_standalone_runs.py +974 -0
  33. src/debate_memory/build_memory_from_eval_results.py +293 -0
  34. src/debate_memory/config.py +189 -0
  35. src/debate_memory/debate_memory_builder.py +477 -0
  36. src/debate_memory/debug_executor.py +136 -0
  37. src/debate_memory/debug_memory.py +163 -0
  38. src/debate_memory/debug_memory_builder.py +150 -0
  39. src/debate_memory/debug_utils.py +99 -0
  40. src/debate_memory/execute.py +522 -0
  41. src/debate_memory/generate_with_memory.py +920 -0
  42. src/debate_memory/llm.py +111 -0
  43. src/debate_memory/memory_bank.py +316 -0
  44. src/debate_memory/memory_intelligence.py +210 -0
  45. src/debate_memory/run_memory_debate.py +580 -0
README.md CHANGED
@@ -1,3 +1,215 @@
- ---
- license: apache-2.0
- ---
+ # Agora-Opt Code Package
+
+ ## What This Directory Contains
+
+ `./code/Agora-Opt/` is the source directory for the Agora-Opt method. It
+ retains two categories of assets:
+
+ - the Agora-Opt implementation
+ - prebuilt memory assets used by the method
+
+ Historical run outputs are not stored here.
+
+ For compatibility with the original stage naming, the main reproduction script
+ maintains two convenience paths:
+
+ - `generated_with_memory`
+ - `debate_runs`
+
+ ## Important Subdirectories
+
+ The most important components are:
+
+ - `src/debate_memory/`: core Agora-Opt implementation
+ - `scripts/`: command-line wrappers
+ - `memory_storage/`: solution memory
+ - `debug_case_memory/`: debug memory retrieval bank
+ - `debate_memory_storage/`: debate memory retrieval bank
+ - `memory_variants/`: retained alternative memory variants
+ - `memory_backups/`: retained memory backups
+
+ Multiple memory versions are intentionally kept. They were prepared during
+ different stages of the project and can all be treated as available assets for
+ generation, debugging, and debate.
+
+ ## Core Workflow
+
+ Agora-Opt runs in two stages.
+
+ ### Stage 1: Generate Initial Solutions
+
+ `generate_with_memory.py` generates candidate solutions, optionally using
+ solution memory and debug memory.
+
+ Primary entry script:
+
+ - `scripts/generate_with_memory.py`
+
+ This stage:
+
+ - reads benchmark problems
+ - retrieves similar solved cases from `memory_storage/`
+ - generates candidate modeling code
+ - uses debug memory during self-repair when execution fails
+
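+ As a rough sketch of a Stage 1 invocation: the flag names below are the ones
+ the bundled shell pipelines pass to this script, while the dataset, model, and
+ output path are only illustrative placeholders.
+
+ ```bash
+ # Minimal Stage 1 sketch; flag names follow scripts/run_generate_and_evaluate.sh,
+ # values are placeholders.
+ python scripts/generate_with_memory.py \
+     --dataset IndustryOR \
+     --model gpt-4o \
+     --temperature 0.01 \
+     --memory_dir memory_storage \
+     --memory_top_k 3 \
+     --max_retries 5 \
+     --execution_timeout 60 \
+     --output generated_with_memory/IndustryOR.jsonl
+ ```
+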
+ ### Stage 2: Run Debate
+
+ `run_memory_debate.py` takes two sets of initial solutions and runs the
+ decentralized debate stage.
+
+ Primary entry script:
+
+ - `scripts/run_memory_debate.py`
+
+ This stage:
+
+ - loads both sides' initial solutions
+ - retrieves historical debate cases from `debate_memory_storage/`
+ - performs iterative comparison, revision, and convergence
+ - executes and evaluates the final consensus solution
+
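+ The full argument list for this stage is defined in
+ `src/debate_memory/run_memory_debate.py` and is not reproduced here; assuming
+ the module exposes an argparse CLI like the other entry points, the quickest
+ way to see its options is:
+
+ ```bash
+ # Inspect the debate-stage options before wiring the script into a pipeline.
+ python scripts/run_memory_debate.py --help
+ ```
+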
+ ## Memory Types
+
+ ### 1. Solution Memory
+
+ Directory:
+
+ - `memory_storage/`
+
+ Purpose:
+
+ - retrieves similar successful modeling cases during generation
+ - supplies formulation templates and structural priors
+
+ Build path:
+
+ - extract `(problem description, correct code, objective value)` from correctly
+   evaluated runs
+ - build `cases.jsonl` plus its retrieval index
+
+ Related script:
+
+ - `scripts/build_memory_from_eval_results.py`
+
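+ A hedged sketch of this build step, with flag names taken from
+ `scripts/build_memory_assets.sh` and placeholder paths:
+
+ ```bash
+ # Rebuild solution memory from one or more evaluation result directories
+ # (the eval and benchmark paths below are placeholders).
+ python scripts/build_memory_from_eval_results.py \
+     --eval_dirs /path/to/eval_dir1 /path/to/eval_dir2 \
+     --benchmarks_dir ../../data/benchmarks \
+     --memory_dir memory_storage
+ ```
+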
+ ### 2. Debug Memory
+
+ Directory:
+
+ - `debug_case_memory/`
+
+ Purpose:
+
+ - retrieves similar execution failures and repair experience
+ - supports automatic self-debugging during generation
+
+ Build path:
+
+ - extract unique error signatures from `debug_memory.jsonl` and its backups
+ - normalize the error text, repair hints, and metadata into a retrieval bank
+
+ Related script:
+
+ - `scripts/build_debug_memory.py`
+
+ Note:
+
+ - raw debug logs are stored in `memory_storage/debug_memory.jsonl`
+ - that log file is one of the inputs used to build debug memory
+
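+ A minimal sketch of the consolidation step; `--output_dir` is the flag used by
+ `scripts/build_memory_assets.sh`:
+
+ ```bash
+ # Consolidate accumulated debug logs into the debug-case retrieval bank.
+ python scripts/build_debug_memory.py --output_dir debug_case_memory
+ ```
+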
+ ### 3. Debate Memory
+
+ Directory:
+
+ - `debate_memory_storage/`
+
+ Purpose:
+
+ - stores examples of how disagreements were resolved during debate
+ - helps later debates converge more efficiently
+
+ Build path:
+
+ - select historical runs where the two initial solutions disagreed
+ - keep cases where debate eventually converged successfully
+ - extract the dispute, key arguments, and final converged code
+
+ Related scripts:
+
+ - `scripts/build_debate_memory.py`
+ - `scripts/process_all_debate_cases.sh`
+
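+ A sketch of the build command, with flags taken from
+ `scripts/process_all_debate_cases.sh` and a placeholder runs directory:
+
+ ```bash
+ # Summarize historical debate runs into the debate memory bank;
+ # --llm_model selects the summarizer used for case extraction.
+ python scripts/build_debate_memory.py \
+     --runs_root ../../results/Agora-Opt/debate \
+     --output_dir debate_memory_storage \
+     --max_workers 64 \
+     --llm_attempts 2 \
+     --llm_model gpt-4o
+ ```
+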
+ ## Suggested Memory Construction Order
+
+ When preparing memory from scratch, the recommended order is:
+
+ 1. run generation and evaluation to obtain `evaluation_results`
+ 2. build solution memory from correct cases
+ 3. build debug memory from accumulated `debug_memory.jsonl`
+ 4. build debate memory from historical debate runs
+
+ The dependency flow is:
+
+ - `evaluation_results` -> `solution memory`
+ - `debug_memory.jsonl` -> `debug memory`
+ - debate run artifacts -> `debate memory`
+
+ ## Retained Memory Assets
+
+ This directory intentionally keeps:
+
+ - the three primary memory stores
+ - memory variants
+ - memory backups
+
+ These are treated as static method assets.
+
+ Historical run outputs are not retained here, which keeps source code, memory
+ assets, and new results clearly separated.
+
+ To rebuild the three memory types, use:
+
+ ```bash
+ bash ./code/Agora-Opt/scripts/build_memory_assets.sh /path/to/eval_dir1 /path/to/eval_dir2
+ ```
+
+ That script attempts to:
+
+ - rebuild solution memory from evaluation directories
+ - rebuild debug memory from `debug_memory.jsonl` and its backups
+ - rebuild debate memory from debate run artifacts
+
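+ The script also reads a few optional environment variables (defined inside
+ `scripts/build_memory_assets.sh`) to redirect its outputs, for example:
+
+ ```bash
+ # Redirect the memory stores and pick the Python binary;
+ # the directory values shown here are placeholders.
+ SOLUTION_MEMORY_DIR=./memory_storage \
+ DEBUG_CASE_MEMORY_DIR=./debug_case_memory \
+ DEBATE_MEMORY_DIR=./debate_memory_storage \
+ PYTHON_BIN=python3 \
+ bash ./code/Agora-Opt/scripts/build_memory_assets.sh /path/to/eval_dir1
+ ```
+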
+ ## Recommended Entry Points
+
+ For paper reproduction, use the outer scripts rather than manually assembling
+ commands in this directory:
+
+ - main table: `./code/scripts/run_agora.sh`
+ - 5.1: `./code/experiments/5.1_compatibility_backbone_llms/`
+ - 5.2: `./code/experiments/5.2_ablation_study/`
+ - 5.3.1: `./code/experiments/5.3.1_centralized_judge_selection/`
+ - 5.3.2: `./code/experiments/5.3.2_impact_of_debate_rounds/`
+ - 5.3.3: `./code/experiments/5.3.3_generalization_of_decentralized_debate_protocol/`
+
+ ## Direct Source-Level Usage
+
+ For direct method-level use, the main wrappers are:
+
+ ```bash
+ python scripts/generate_with_memory.py
+ python scripts/run_memory_debate.py
+ python scripts/execute.py
+ python scripts/build_memory_from_eval_results.py
+ python scripts/build_debug_memory.py
+ python scripts/build_debate_memory.py
+ ```
+
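+ For reference, a more concrete pair of calls (flag names as used by the
+ bundled shell pipelines; dataset and paths are placeholders):
+
+ ```bash
+ # Generate solutions for one dataset, then execute and score them.
+ python scripts/generate_with_memory.py \
+     --dataset ComplexLP \
+     --model gpt-4o \
+     --memory_dir memory_storage \
+     --output runs/ComplexLP.jsonl
+ python scripts/execute.py \
+     --input_file runs/ComplexLP.jsonl \
+     --output_dir runs/ComplexLP_eval \
+     --timeout 90 \
+     --tolerance 0.05 \
+     --use_relative_tolerance
+ ```
+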
+ ## Path Conventions
+
+ Within the open-source package, the intended layout is:
+
+ - benchmark data: `./data/benchmarks/`
+ - Agora-Opt source code and memory: `./code/Agora-Opt/`
+
+ This separation makes the boundaries between code, memory assets, and newly
+ generated outputs explicit.
scripts/__pycache__/run_ablation_suite.cpython-311.pyc ADDED
Binary file (19.3 kB).
 
scripts/augment_memory_from_standalone_runs.py ADDED
@@ -0,0 +1,16 @@
+ #!/usr/bin/env python3
+ """Wrapper for debate_memory.augment_memory_from_standalone_runs."""
+
+ from pathlib import Path
+ import sys
+
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ SRC_DIR = PROJECT_ROOT / "src"
+ if str(SRC_DIR) not in sys.path:
+     sys.path.insert(0, str(SRC_DIR))
+
+ from debate_memory.augment_memory_from_standalone_runs import main
+
+
+ if __name__ == "__main__":
+     main()
scripts/build_debate_memory.py ADDED
@@ -0,0 +1,17 @@
+ #!/usr/bin/env python3
+ """Wrapper for debate_memory.debate_memory_builder."""
+
+ from pathlib import Path
+ import sys
+
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ SRC_DIR = PROJECT_ROOT / "src"
+ if str(SRC_DIR) not in sys.path:
+     sys.path.insert(0, str(SRC_DIR))
+
+ from debate_memory.debate_memory_builder import main
+
+
+ if __name__ == "__main__":
+     main()
+
scripts/build_debug_memory.py ADDED
@@ -0,0 +1,17 @@
+ #!/usr/bin/env python3
+ """Wrapper to consolidate debug_memory.jsonl entries into a memory bank."""
+
+ from pathlib import Path
+ import sys
+
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ SRC_DIR = PROJECT_ROOT / "src"
+ if str(SRC_DIR) not in sys.path:
+     sys.path.insert(0, str(SRC_DIR))
+
+ from debate_memory.debug_memory_builder import main
+
+
+ if __name__ == "__main__":
+     main()
+
scripts/build_memory_assets.sh ADDED
@@ -0,0 +1,56 @@
+ #!/bin/bash
+
+ set -euo pipefail
+
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ AGORA_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+ OPEN_ROOT="$(cd "${AGORA_DIR}/../.." && pwd)"
+ RESULTS_ROOT="${OPEN_ROOT}/results"
+ BENCHMARK_DIR="${OPEN_ROOT}/data/benchmarks"
+ PYTHON_BIN="${PYTHON_BIN:-python3}"
+
+ SOLUTION_MEMORY_DIR="${SOLUTION_MEMORY_DIR:-${AGORA_DIR}/memory_storage}"
+ DEBUG_CASE_MEMORY_DIR="${DEBUG_CASE_MEMORY_DIR:-${AGORA_DIR}/debug_case_memory}"
+ DEBATE_MEMORY_DIR="${DEBATE_MEMORY_DIR:-${AGORA_DIR}/debate_memory_storage}"
+ DEBATE_RUNS_ROOT="${DEBATE_RUNS_ROOT:-${RESULTS_ROOT}/Agora-Opt/debate}"
+
+ export PYTHONPATH="${AGORA_DIR}/src:${PYTHONPATH:-}"
+
+ echo "============================================================"
+ echo "Agora-Opt Memory Builder"
+ echo "============================================================"
+ echo "Solution memory: ${SOLUTION_MEMORY_DIR}"
+ echo "Debug memory: ${DEBUG_CASE_MEMORY_DIR}"
+ echo "Debate memory: ${DEBATE_MEMORY_DIR}"
+ echo "Debate runs: ${DEBATE_RUNS_ROOT}"
+ echo "============================================================"
+ echo
+
+ if [[ "$#" -gt 0 ]]; then
+     echo "Building solution memory from evaluation directories..."
+     "${PYTHON_BIN}" "${SCRIPT_DIR}/build_memory_from_eval_results.py" \
+         --eval_dirs "$@" \
+         --benchmarks_dir "${BENCHMARK_DIR}" \
+         --memory_dir "${SOLUTION_MEMORY_DIR}"
+     echo
+ else
+     echo "Skipping solution memory rebuild because no evaluation directories were provided."
+     echo "Usage example:"
+     echo "  bash ./code/Agora-Opt/scripts/build_memory_assets.sh /path/to/eval_dir1 /path/to/eval_dir2"
+     echo
+ fi
+
+ echo "Building debug memory..."
+ "${PYTHON_BIN}" "${SCRIPT_DIR}/build_debug_memory.py" \
+     --output_dir "${DEBUG_CASE_MEMORY_DIR}"
+ echo
+
+ if [[ -d "${DEBATE_RUNS_ROOT}" ]]; then
+     echo "Building debate memory..."
+     "${PYTHON_BIN}" "${SCRIPT_DIR}/build_debate_memory.py" \
+         --runs_root "${DEBATE_RUNS_ROOT}" \
+         --output_dir "${DEBATE_MEMORY_DIR}"
+ else
+     echo "Skipping debate memory rebuild because debate runs root does not exist:"
+     echo "  ${DEBATE_RUNS_ROOT}"
+ fi
scripts/build_memory_from_eval_results.py ADDED
@@ -0,0 +1,17 @@
+ #!/usr/bin/env python3
+ """Wrapper for debate_memory.build_memory_from_eval_results."""
+
+ from pathlib import Path
+ import sys
+
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ SRC_DIR = PROJECT_ROOT / "src"
+ if str(SRC_DIR) not in sys.path:
+     sys.path.insert(0, str(SRC_DIR))
+
+ from debate_memory.build_memory_from_eval_results import main
+
+
+ if __name__ == "__main__":
+     main()
+
scripts/execute.py ADDED
@@ -0,0 +1,18 @@
+ #!/usr/bin/env python3
+ """Wrapper to run debate_memory.execute with package imports resolved."""
+
+ from pathlib import Path
+ import sys
+
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ SRC_DIR = PROJECT_ROOT / "src"
+ if str(SRC_DIR) not in sys.path:
+     sys.path.insert(0, str(SRC_DIR))
+
+ from debate_memory.execute import parse_args, main
+
+
+ if __name__ == "__main__":
+     args = parse_args()
+     main(args)
+
scripts/generate_with_memory.py ADDED
@@ -0,0 +1,17 @@
+ #!/usr/bin/env python3
+ """Wrapper to run debate_memory.generate_with_memory as a script."""
+
+ from pathlib import Path
+ import sys
+
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ SRC_DIR = PROJECT_ROOT / "src"
+ if str(SRC_DIR) not in sys.path:
+     sys.path.insert(0, str(SRC_DIR))
+
+ from debate_memory.generate_with_memory import main
+
+
+ if __name__ == "__main__":
+     main()
+
scripts/process_all_debate_cases.sh ADDED
@@ -0,0 +1,64 @@
+ #!/bin/bash
+
+ # Batch process every historical debate run and refresh the debate memory bank.
+ #
+ # Usage:
+ #   ./scripts/process_all_debate_cases.sh [runs_root] [output_dir]
+ # Example:
+ #   ./scripts/process_all_debate_cases.sh \
+ #     ../../results/Agora-Opt/debate \
+ #     debate_memory_storage
+ #
+ # Environment variables (optional):
+ #   LLM_MODEL    - override default gpt-4o summarizer
+ #   LLM_ATTEMPTS - retries per case (default 2)
+ #   MAX_WORKERS  - thread pool size (default 64)
+ #   PYTHON_BIN   - python executable (default python)
+
+ set -euo pipefail
+
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+ DEFAULT_RUNS_ROOT="${PROJECT_ROOT}/../../results/Agora-Opt/debate"
+
+ RUNS_ROOT="${1:-$DEFAULT_RUNS_ROOT}"
+ OUTPUT_DIR="${2:-${PROJECT_ROOT}/debate_memory_storage}"
+
+ LLM_MODEL="${LLM_MODEL:-gpt-4o}"
+ LLM_ATTEMPTS="${LLM_ATTEMPTS:-2}"
+ MAX_WORKERS="${MAX_WORKERS:-64}"
+ PYTHON_BIN="${PYTHON_BIN:-python}"
+
+ echo "============================================================"
+ echo "🧠 Building Debate Memory"
+ echo "============================================================"
+ echo "Runs root: ${RUNS_ROOT}"
+ echo "Output dir: ${OUTPUT_DIR}"
+ echo "LLM model: ${LLM_MODEL:-<heuristic>}"
+ echo "LLM attempts: ${LLM_ATTEMPTS}"
+ echo "Max workers: ${MAX_WORKERS}"
+ echo "Python binary: ${PYTHON_BIN}"
+ echo "============================================================"
+ echo
+
+ CMD=(
+     "${PYTHON_BIN}"
+     "${PROJECT_ROOT}/scripts/build_debate_memory.py"
+     "--runs_root" "${RUNS_ROOT}"
+     "--output_dir" "${OUTPUT_DIR}"
+     "--max_workers" "${MAX_WORKERS}"
+     "--llm_attempts" "${LLM_ATTEMPTS}"
+ )
+
+ if [ -n "${LLM_MODEL}" ]; then
+     CMD+=("--llm_model" "${LLM_MODEL}")
+ fi
+
+ echo "Running: ${CMD[*]}"
+ echo
+
+ "${CMD[@]}"
+
+ echo
+ echo "✅ Debate memory refreshed."
+ echo "Cases stored in: ${OUTPUT_DIR}"
scripts/run_ablation_suite.py ADDED
@@ -0,0 +1,403 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Run a suite of ablation experiments (generation + evaluation) and summarise results.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ import shlex
12
+ import subprocess
13
+ import sys
14
+ from dataclasses import dataclass
15
+ from datetime import datetime
16
+ from pathlib import Path
17
+ from typing import Dict, List, Sequence, Tuple
18
+
19
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
20
+ STANDARD_RESULTS_ROOT = PROJECT_ROOT.parent.parent / "results" / "Agora-Opt"
21
+ GENERATE_SCRIPT = PROJECT_ROOT / "scripts" / "generate_with_memory.py"
22
+ EXECUTE_SCRIPT = PROJECT_ROOT / "scripts" / "execute.py"
23
+ PYTHON_BIN = os.environ.get("PYTHON_BIN", sys.executable)
24
+
25
+
26
+ @dataclass
27
+ class Variant:
28
+ name: str
29
+ description: str
30
+ overrides: Dict[str, object]
31
+
32
+
33
+ def parse_args() -> argparse.Namespace:
34
+ parser = argparse.ArgumentParser(
35
+ description="Run generate+evaluate ablations and emit a summary table."
36
+ )
37
+ parser.add_argument("--model", type=str, default="gpt-4o", help="LLM to query.")
38
+ parser.add_argument(
39
+ "--datasets",
40
+ nargs="+",
41
+ default=["IndustryOR", "ComplexLP"],
42
+ help="Datasets to evaluate (space-separated, omit .jsonl).",
43
+ )
44
+ parser.add_argument("--temperature", type=float, default=0.01)
45
+ parser.add_argument(
46
+ "--max_problems",
47
+ type=int,
48
+ default=None,
49
+ help="Limit number of problems per dataset (omit for full set).",
50
+ )
51
+ parser.add_argument("--memory_dir", type=str, default="memory_storage")
52
+ parser.add_argument(
53
+ "--memory_top_k",
54
+ type=int,
55
+ default=3,
56
+ help="Base episodic memory retrieval count for the full variant.",
57
+ )
58
+ parser.add_argument(
59
+ "--max_retries",
60
+ type=int,
61
+ default=5,
62
+ help="Base retry budget for the full variant.",
63
+ )
64
+ parser.add_argument(
65
+ "--debug_case_top_k",
66
+ type=int,
67
+ default=3,
68
+ help="Base debug-case retrieval count.",
69
+ )
70
+ parser.add_argument(
71
+ "--parallel",
72
+ type=int,
73
+ default=64,
74
+ help="Workers for generation (passed to --parallel).",
75
+ )
76
+ parser.add_argument(
77
+ "--execution_timeout",
78
+ type=int,
79
+ default=90,
80
+ help="Timeout per execution attempt in generate_with_memory.",
81
+ )
82
+ parser.add_argument(
83
+ "--debug_memory_path",
84
+ type=str,
85
+ default="memory_storage/debug_memory.jsonl",
86
+ help="Path to debug memory JSONL.",
87
+ )
88
+ parser.add_argument(
89
+ "--debug_case_dir",
90
+ type=str,
91
+ default="debug_case_memory",
92
+ help="Directory containing consolidated debug-case memory.",
93
+ )
94
+ parser.add_argument(
95
+ "--output_root",
96
+ type=str,
97
+ default=str(STANDARD_RESULTS_ROOT / "ablations"),
98
+ help="Root folder for storing ablation artefacts.",
99
+ )
100
+ parser.add_argument(
101
+ "--eval_timeout",
102
+ type=int,
103
+ default=90,
104
+ help="Timeout for scripts/execute.py.",
105
+ )
106
+ parser.add_argument(
107
+ "--num_workers",
108
+ type=int,
109
+ default=64,
110
+ help="ProcessPool workers for evaluation.",
111
+ )
112
+ parser.add_argument("--tolerance", type=float, default=0.05)
113
+ parser.add_argument(
114
+ "--relative_tolerance",
115
+ action="store_true",
116
+ help="Use relative tolerance in evaluation.",
117
+ )
118
+ parser.add_argument(
119
+ "--dry_run",
120
+ action="store_true",
121
+ help="Print commands without executing or aggregating results.",
122
+ )
123
+ return parser.parse_args()
124
+
125
+
126
+ def build_variants(args: argparse.Namespace) -> List[Variant]:
127
+ base = {
128
+ "memory_top_k": args.memory_top_k,
129
+ "use_llm_refinement": True,
130
+ "debug_case_memory_top_k": args.debug_case_top_k,
131
+ "max_retries": args.max_retries,
132
+ "auto_debug": True,
133
+ }
134
+ return [
135
+ Variant(
136
+ name="full_system",
137
+ description="All helpers enabled (reference).",
138
+ overrides={**base},
139
+ ),
140
+ Variant(
141
+ name="no_llm_refine",
142
+ description="Skip LLM summarisation of retrieved cases.",
143
+ overrides={**base, "use_llm_refinement": False},
144
+ ),
145
+ Variant(
146
+ name="no_debug_case_memory",
147
+ description="Disable historical debug-case retrieval.",
148
+ overrides={**base, "debug_case_memory_top_k": 0},
149
+ ),
150
+ Variant(
151
+ name="no_self_healing",
152
+ description="Single attempt (max_retries=1) but still executes locally once.",
153
+ overrides={**base, "max_retries": 1},
154
+ ),
155
+ Variant(
156
+ name="no_memory",
157
+ description="Disable episodic retrieval, keep retries on.",
158
+ overrides={**base, "memory_top_k": 0, "use_llm_refinement": False},
159
+ ),
160
+ Variant(
161
+ name="vanilla_llm",
162
+ description="Pure single-shot LLM (no memory, no auto-debug).",
163
+ overrides={
164
+ **base,
165
+ "memory_top_k": 0,
166
+ "use_llm_refinement": False,
167
+ "debug_case_memory_top_k": 0,
168
+ "max_retries": 1,
169
+ "auto_debug": False,
170
+ },
171
+ ),
172
+ ]
173
+
174
+
175
+ def run_command(cmd: Sequence[str], dry_run: bool = False) -> None:
176
+ pretty = " ".join(shlex.quote(part) for part in cmd)
177
+ print(f" → {pretty}")
178
+ if dry_run:
179
+ return
180
+ subprocess.run(cmd, check=True)
181
+
182
+
183
+ def compute_attempt_stats(path: Path) -> Tuple[float, int]:
184
+ if not path.exists():
185
+ return 0.0, 0
186
+ total = 0
187
+ total_attempts = 0
188
+ multi_attempt = 0
189
+ with path.open("r", encoding="utf-8") as handle:
190
+ for line in handle:
191
+ line = line.strip()
192
+ if not line:
193
+ continue
194
+ record = json.loads(line)
195
+ attempts = record.get("total_attempts", 1)
196
+ total_attempts += attempts
197
+ total += 1
198
+ if attempts > 1:
199
+ multi_attempt += 1
200
+ avg = (total_attempts / total) if total else 0.0
201
+ return avg, multi_attempt
202
+
203
+
204
+ def format_percent(value: float) -> str:
205
+ return f"{value * 100:.1f}%"
206
+
207
+
208
+ def build_generate_args(
209
+ dataset: str,
210
+ output_file: Path,
211
+ debug_dir: Path,
212
+ args: argparse.Namespace,
213
+ cfg: Dict[str, object],
214
+ ) -> List[str]:
215
+ cmd = [
216
+ os.fspath(GENERATE_SCRIPT),
217
+ "--dataset",
218
+ dataset,
219
+ "--model",
220
+ args.model,
221
+ "--temperature",
222
+ str(args.temperature),
223
+ "--output",
224
+ os.fspath(output_file),
225
+ "--memory_dir",
226
+ os.fspath(Path(args.memory_dir).resolve()),
227
+ "--parallel",
228
+ str(args.parallel),
229
+ "--execution_timeout",
230
+ str(args.execution_timeout),
231
+ "--debug_memory_path",
232
+ os.fspath(Path(args.debug_memory_path).resolve()),
233
+ "--debug_case_memory_dir",
234
+ os.fspath(Path(args.debug_case_dir).resolve()),
235
+ "--debug_case_memory_top_k",
236
+ str(int(cfg.get("debug_case_memory_top_k", 0))),
237
+ "--memory_top_k",
238
+ str(int(cfg.get("memory_top_k", 0))),
239
+ "--max_retries",
240
+ str(int(cfg.get("max_retries", 1))),
241
+ ]
242
+ if args.max_problems:
243
+ cmd += ["--max_problems", str(args.max_problems)]
244
+ if cfg.get("use_llm_refinement"):
245
+ cmd.append("--use_llm_refinement")
246
+ if not cfg.get("filter_perfect", True):
247
+ cmd.append("--no_filter_perfect")
248
+ if not cfg.get("auto_debug", True):
249
+ cmd.append("--no_auto_debug")
250
+ if debug_dir:
251
+ cmd += ["--debug_output_dir", os.fspath(debug_dir)]
252
+ return [os.fspath(part) for part in cmd]
253
+
254
+
255
+ def build_execute_args(input_file: Path, output_dir: Path, args: argparse.Namespace) -> List[str]:
256
+ cmd = [
257
+ os.fspath(EXECUTE_SCRIPT),
258
+ "--input_file",
259
+ os.fspath(input_file),
260
+ "--output_dir",
261
+ os.fspath(output_dir),
262
+ "--timeout",
263
+ str(args.eval_timeout),
264
+ "--tolerance",
265
+ str(args.tolerance),
266
+ "--num_workers",
267
+ str(args.num_workers),
268
+ "--memory_dir",
269
+ os.fspath(Path(args.memory_dir).resolve()),
270
+ "--debug_memory_path",
271
+ os.fspath(Path(args.debug_memory_path).resolve()),
272
+ ]
273
+ if args.relative_tolerance:
274
+ cmd.append("--use_relative_tolerance")
275
+ return cmd
276
+
277
+
278
+ def summarise_records(records: List[Dict], summary_path: Path) -> None:
279
+ if not records:
280
+ return
281
+ md_lines = [
282
+ "| Dataset | Variant | Accuracy | Correct/Total | Exec Err % | Timeout % | No-Code % | Avg Attempts | Notes |",
283
+ "| --- | --- | --- | --- | --- | --- | --- | --- | --- |",
284
+ ]
285
+ csv_lines = [
286
+ "dataset,variant,accuracy,correct,total,exec_error_pct,timeout_pct,no_code_pct,avg_attempts,notes"
287
+ ]
288
+ for record in records:
289
+ dataset = record["dataset"]
290
+ variant = record["variant"]
291
+ report = record["report"]
292
+ status_counts = report.get("status_counts", {})
293
+ total = report.get("total_problems", 0)
294
+ accuracy_pct = format_percent(report.get("accuracy", 0.0))
295
+ correct = report.get("correct", 0)
296
+ exec_err_pct = (
297
+ (status_counts.get("execution_error", 0) / total) if total else 0.0
298
+ )
299
+ timeout_pct = (status_counts.get("timeout", 0) / total) if total else 0.0
300
+ no_code_pct = (status_counts.get("no_code", 0) / total) if total else 0.0
301
+ avg_attempts = record.get("avg_attempts", 0.0)
302
+ notes = record["notes"]
303
+ md_lines.append(
304
+ f"| {dataset} | {variant} | {accuracy_pct} | {correct}/{total} | "
305
+ f"{exec_err_pct*100:.1f}% | {timeout_pct*100:.1f}% | {no_code_pct*100:.1f}% | "
306
+ f"{avg_attempts:.2f} | {notes} |"
307
+ )
308
+ safe_notes = notes.replace('"', '""')
309
+ csv_lines.append(
310
+ f"{dataset},{variant},{report.get('accuracy',0.0):.4f},{correct},{total},"
311
+ f"{exec_err_pct:.4f},{timeout_pct:.4f},{no_code_pct:.4f},{avg_attempts:.4f},\"{safe_notes}\""
312
+ )
313
+ summary_path.write_text("\n".join(md_lines) + "\n", encoding="utf-8")
314
+ csv_path = summary_path.with_suffix(".csv")
315
+ csv_path.write_text("\n".join(csv_lines) + "\n", encoding="utf-8")
316
+ print(f"\n✅ Summary table written to: {summary_path}")
317
+ print(f"📄 CSV export written to: {csv_path}")
318
+
319
+
320
+ def main() -> None:
321
+ args = parse_args()
322
+ variants = build_variants(args)
323
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
324
+ run_root = Path(args.output_root).resolve() / timestamp
325
+ if not args.dry_run:
326
+ run_root.mkdir(parents=True, exist_ok=True)
327
+
328
+ print("========================================")
329
+ print("Ablation Runner")
330
+ print("========================================")
331
+ print(f"Model: {args.model}")
332
+ print(f"Datasets: {', '.join(args.datasets)}")
333
+ print(f"Output root: {run_root if not args.dry_run else args.output_root}")
334
+ print(f"Dry run: {args.dry_run}")
335
+ print("========================================\n")
336
+
337
+ records: List[Dict] = []
338
+ for dataset in args.datasets:
339
+ print(f"Dataset: {dataset}")
340
+ for variant in variants:
341
+ cfg = variant.overrides
342
+ variant_name = variant.name
343
+ print(f" Variant: {variant_name} – {variant.description}")
344
+ dataset_slug = dataset.replace("/", "_")
345
+ gen_output = (
346
+ run_root / f"{dataset_slug}_{variant_name}.jsonl"
347
+ if not args.dry_run
348
+ else Path(f"{dataset_slug}_{variant_name}.jsonl")
349
+ )
350
+ debug_dir = (
351
+ run_root / "debug" / dataset_slug / variant_name
352
+ if not args.dry_run
353
+ else Path(f"debug/{dataset_slug}/{variant_name}")
354
+ )
355
+ eval_dir = (
356
+ run_root / f"{dataset_slug}_{variant_name}_eval"
357
+ if not args.dry_run
358
+ else Path(f"{dataset_slug}_{variant_name}_eval")
359
+ )
360
+ if not args.dry_run:
361
+ debug_dir.mkdir(parents=True, exist_ok=True)
362
+ gen_cmd = [PYTHON_BIN] + build_generate_args(
363
+ dataset, gen_output, debug_dir, args, cfg
364
+ )
365
+ run_command(gen_cmd, dry_run=args.dry_run)
366
+
367
+ exec_cmd = [
368
+ PYTHON_BIN,
369
+ ] + build_execute_args(gen_output, eval_dir, args)
370
+ run_command(exec_cmd, dry_run=args.dry_run)
371
+
372
+ if args.dry_run:
373
+ continue
374
+
375
+ report_path = eval_dir / "evaluation_report.json"
376
+ if not report_path.exists():
377
+ raise FileNotFoundError(
378
+ f"Missing evaluation report for {dataset} / {variant_name}: {report_path}"
379
+ )
380
+ with report_path.open("r", encoding="utf-8") as handle:
381
+ report = json.load(handle)
382
+ avg_attempts, _ = compute_attempt_stats(gen_output)
383
+ records.append(
384
+ {
385
+ "dataset": dataset,
386
+ "variant": variant_name,
387
+ "report": report,
388
+ "avg_attempts": avg_attempts,
389
+ "notes": variant.description,
390
+ }
391
+ )
392
+ print("")
393
+
394
+ if args.dry_run:
395
+ print("Dry run completed. No commands were executed.")
396
+ return
397
+
398
+ summary_path = run_root / "ablation_summary.md"
399
+ summarise_records(records, summary_path)
400
+
401
+
402
+ if __name__ == "__main__":
403
+ main()
scripts/run_generate_and_evaluate.sh ADDED
@@ -0,0 +1,640 @@
1
+ #!/bin/bash
2
+
3
+ set -uo pipefail
4
+
5
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
6
+ PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
7
+ OPEN_ROOT="$(cd "${PROJECT_ROOT}/../.." && pwd)"
8
+ SRC_DIR="${PROJECT_ROOT}/src"
9
+ export PYTHONPATH="${SRC_DIR}:${PYTHONPATH:-}"
10
+
11
+ # Generate and Evaluate - Combined pipeline for generation + evaluation
12
+ # Usage: ./run_generate_and_evaluate.sh [model_name] [max_problems] [num_workers] [timeout] [tolerance] [dataset_name]
13
+ #
14
+ # Environment Variables:
15
+ # REFRESH_DEBUG_MEMORY - Set to "false" to disable auto-backup and clearing of debug memory (default: true)
16
+ # RUN_ALL_BENCHMARKS - Set to "true" to run all benchmarks in ./data/benchmarks/ (default: true)
17
+ # USE_HF_OFFLINE - Set to "false" to allow downloading models from Hugging Face (default: true)
18
+ # PARALLEL_BENCHMARKS - Set to "true" to run benchmarks in parallel (default: true)
19
+ # MAX_PARALLEL_JOBS - Maximum number of parallel jobs (default: 4)
20
+ # DATASET_NAME - Dataset to run when RUN_ALL_BENCHMARKS=false (default: IndustryOR)
21
+ # EMBEDDING_MODEL - Optional embedding model name or local path passed to memory retrieval
22
+ #
23
+ # Example:
24
+ # ./run_generate_and_evaluate.sh # Run with default settings (all benchmarks, offline mode, parallel)
25
+ # RUN_ALL_BENCHMARKS=false ./run_generate_and_evaluate.sh # Run single dataset
26
+ # RUN_ALL_BENCHMARKS=false ./run_generate_and_evaluate.sh gpt-4o 100 64 90 0.05 OPT-Principled
27
+ # USE_HF_OFFLINE=false ./run_generate_and_evaluate.sh # Allow downloading models
28
+ # REFRESH_DEBUG_MEMORY=false ./run_generate_and_evaluate.sh # Run without refreshing debug memory
29
+ # PARALLEL_BENCHMARKS=false ./run_generate_and_evaluate.sh # Run sequentially
30
+ # MAX_PARALLEL_JOBS=2 ./run_generate_and_evaluate.sh # Limit to 2 parallel jobs
31
+
32
+ MODEL=${1:-"gpt-4o"}
33
+ MAX_PROBLEMS=${2:-1000}
34
+ NUM_WORKERS=${3:-100}
35
+ TIMEOUT=${4:-60}
36
+ TOLERANCE=${5:-0.05}
37
+
38
+ # Configuration: Auto-backup and clear debug memory before running
39
+ # Set to "false" to disable this feature
40
+ REFRESH_DEBUG_MEMORY=${REFRESH_DEBUG_MEMORY:-true}
41
+
42
+ # Configuration: Run all benchmarks or single dataset
43
+ RUN_ALL_BENCHMARKS=${RUN_ALL_BENCHMARKS:-true}
44
+
45
+ # Configuration: Use offline mode for Hugging Face (avoid network calls)
46
+ # Set to "false" if you need to download models for the first time
47
+ USE_HF_OFFLINE=${USE_HF_OFFLINE:-true}
48
+
49
+ # Configuration: Run benchmarks in parallel
50
+ # Set to "true" to enable concurrent datasets (default: sequential datasets)
51
+ PARALLEL_BENCHMARKS=${PARALLEL_BENCHMARKS:-false}
52
+
53
+ # Configuration: Maximum number of parallel jobs
54
+ # Adjust based on your system resources
55
+ MAX_PARALLEL_JOBS=${MAX_PARALLEL_JOBS:-4}
56
+
57
+ # Default single dataset
58
+ DEFAULT_DATASET=${DATASET_NAME:-${6:-"IndustryOR"}}
59
+ # DEFAULT_DATASET="ComplexOR"
60
+ TEMPERATURE=${TEMPERATURE:-0.01}
61
+ MEMORY_DIR="${PROJECT_ROOT}/memory_storage"
62
+ MEMORY_TOP_K=${MEMORY_TOP_K:-3}
63
+ PARALLEL=${PARALLEL:-128}
64
+ MAIN_TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
65
+ OUTPUT_DIR="${OPEN_ROOT}/results/Agora-Opt/generate_and_evaluate"
66
+ MAX_RETRIES=${MAX_RETRIES:-5}
67
+ BENCHMARKS_DIR="${PROJECT_ROOT}/../../data/benchmarks"
68
+ EMBEDDING_MODEL=${EMBEDDING_MODEL:-}
69
+
70
+ GENERATE_CLI="${PROJECT_ROOT}/scripts/generate_with_memory.py"
71
+ EXECUTE_CLI="${PROJECT_ROOT}/scripts/execute.py"
72
+
73
+ if [ -d "${BENCHMARKS_DIR}" ]; then
74
+ BENCHMARKS_DIR="$(cd "${BENCHMARKS_DIR}" && pwd)"
75
+ elif [ -d "${PROJECT_ROOT}/clean_benchmarks" ]; then
76
+ BENCHMARKS_DIR="$(cd "${PROJECT_ROOT}/clean_benchmarks" && pwd)"
77
+ elif [ -d "${PROJECT_ROOT}/../clean_benchmarks" ]; then
78
+ BENCHMARKS_DIR="$(cd "${PROJECT_ROOT}/../clean_benchmarks" && pwd)"
79
+ fi
80
+
81
+ # Create output directory
82
+ mkdir -p "${OUTPUT_DIR}"
83
+
84
+ ensure_or_debate_env() {
85
+ if [ "${CONDA_DEFAULT_ENV:-}" = "or-debate" ] && command -v python >/dev/null 2>&1; then
86
+ return 0
87
+ fi
88
+
89
+ if ! command -v conda >/dev/null 2>&1; then
90
+ echo "❌ conda command not found. Please install Conda or activate the or-debate environment manually."
91
+ return 1
92
+ fi
93
+
94
+ local conda_bin
95
+ local conda_base
96
+ conda_bin="$(command -v conda)"
97
+ conda_base="$(cd "$(dirname "${conda_bin}")/.." && pwd)"
98
+
99
+ if [ -f "${conda_base}/etc/profile.d/conda.sh" ]; then
100
+ # shellcheck disable=SC1090
101
+ source "${conda_base}/etc/profile.d/conda.sh"
102
+ else
103
+ eval "$("${conda_bin}" shell.bash hook)"
104
+ fi
105
+
106
+ conda activate or-debate
107
+ }
108
+
109
+ # ============================================
110
+ # Function: Backup and Clear Debug Memory
111
+ # ============================================
112
+ backup_debug_memory() {
113
+ if [ "${REFRESH_DEBUG_MEMORY}" = "true" ]; then
114
+ DEBUG_MEMORY_FILE="${MEMORY_DIR}/debug_memory.jsonl"
115
+ BACKUP_DIR="${MEMORY_DIR}/backups/${MAIN_TIMESTAMP}"
116
+
117
+ if [ -f "${DEBUG_MEMORY_FILE}" ]; then
118
+ echo "================================================"
119
+ echo "🗂️ Backing up debug memory..."
120
+ echo "================================================"
121
+
122
+ # Create backup directory
123
+ mkdir -p ${BACKUP_DIR}
124
+
125
+ # Copy debug_memory.jsonl to backup
126
+ cp "${DEBUG_MEMORY_FILE}" "${BACKUP_DIR}/debug_memory.jsonl"
127
+
128
+ # Get file size and line count
129
+ FILE_SIZE=$(du -h "${DEBUG_MEMORY_FILE}" | cut -f1)
130
+ LINE_COUNT=$(wc -l < "${DEBUG_MEMORY_FILE}")
131
+
132
+ echo "✅ Backed up debug memory:"
133
+ echo " Location: ${BACKUP_DIR}/debug_memory.jsonl"
134
+ echo " Size: ${FILE_SIZE}"
135
+ echo " Lines: ${LINE_COUNT}"
136
+
137
+ # Clear the original file
138
+ > "${DEBUG_MEMORY_FILE}"
139
+ echo "✅ Cleared original debug memory file"
140
+ echo ""
141
+ else
142
+ echo "ℹ️ No debug memory file found, skipping backup"
143
+ echo ""
144
+ fi
145
+ else
146
+ echo "ℹ️ Debug memory refresh is disabled (REFRESH_DEBUG_MEMORY=false)"
147
+ echo ""
148
+ fi
149
+ }
150
+
151
+ normalize_dataset_name() {
152
+ local dataset_name="$1"
153
+ dataset_name="${dataset_name%.jsonl}"
154
+ case "${dataset_name}" in
155
+ ComplexLP_clean) echo "ComplexLP" ;;
156
+ EasyLP_clean) echo "EasyLP" ;;
157
+ IndustryOR_clean|IndustryOR_v2|IndustryOR_fixedV2|IndustryOR_fixedV2_clean) echo "IndustryOR" ;;
158
+ NL4Opt|NL4Opt_clean|NL4OPT_clean) echo "NL4OPT" ;;
159
+ NLP4LP_clean) echo "NLP4LP" ;;
160
+ ComplexOR_clean) echo "ComplexOR" ;;
161
+ ReSocratic_clean) echo "ReSocratic" ;;
162
+ combined|combined_dataset|OPT-Principled_clean) echo "OPT-Principled" ;;
163
+ *) echo "${dataset_name}" ;;
164
+ esac
165
+ }
166
+
167
+ DEFAULT_DATASET="$(normalize_dataset_name "${DEFAULT_DATASET}")"
168
+
169
+ # ============================================
170
+ # Function: Run single dataset (core logic)
171
+ # ============================================
172
+ process_dataset() {
173
+ local DATASET_NAME
174
+ DATASET_NAME="$(normalize_dataset_name "$1")"
175
+ local TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
176
+ local OUTPUT_FILE="${OUTPUT_DIR}/${MODEL}_${DATASET_NAME}_${TIMESTAMP}.jsonl"
177
+ local EVAL_FILE="${OUTPUT_DIR}/${MODEL}_${DATASET_NAME}_eval_${TIMESTAMP}.jsonl"
178
+ local EVAL_REPORT="${EVAL_FILE}/evaluation_report.json"
179
+
180
+ echo ""
181
+ echo "╔════════════════════════════════════════════════╗"
182
+ echo "║ Processing Dataset: ${DATASET_NAME}"
183
+ echo "╚════════════════════════════════════════════════╝"
184
+ echo ""
185
+
186
+ # ============================================
187
+ # STEP 1: Generation
188
+ # ============================================
189
+ echo "================================================"
190
+ echo "📝 STEP 1/2: Generating code with memory..."
191
+ echo "================================================"
192
+ echo "Dataset: ${DATASET_NAME}"
193
+ echo ""
194
+
195
+ local generate_args=(
196
+ --dataset "${DATASET_NAME}"
197
+ --model "${MODEL}"
198
+ --temperature "${TEMPERATURE}"
199
+ --max_problems "${MAX_PROBLEMS}"
200
+ --memory_dir "${MEMORY_DIR}"
201
+ --memory_top_k "${MEMORY_TOP_K}"
202
+ --parallel "${PARALLEL}"
203
+ --output "${OUTPUT_FILE}"
204
+ --max_retries "${MAX_RETRIES}"
205
+ --execution_timeout 60
206
+ )
207
+
208
+ if [ -n "${EMBEDDING_MODEL}" ]; then
209
+ generate_args+=(--embedding_model "${EMBEDDING_MODEL}")
210
+ fi
211
+
212
+ python "${GENERATE_CLI}" "${generate_args[@]}"
213
+
214
+ EXIT_CODE=$?
215
+
216
+ if [ ${EXIT_CODE} -ne 0 ]; then
217
+ echo ""
218
+ echo "❌ Generation failed for ${DATASET_NAME} with exit code ${EXIT_CODE}"
219
+ return 1
220
+ fi
221
+
222
+ echo ""
223
+ echo "✅ Generation completed for ${DATASET_NAME}!"
224
+ echo ""
225
+
226
+ # Show generation summary
227
+ if [ -f "${OUTPUT_FILE}" ]; then
228
+ TOTAL=$(wc -l < ${OUTPUT_FILE})
229
+ SUCCESS=$(grep -c '"status": "success"' "${OUTPUT_FILE}" 2>/dev/null || true)
230
+ if [ -z "${SUCCESS}" ]; then
231
+ SUCCESS=0
232
+ fi
233
+ echo "📊 Generation Summary:"
234
+ echo " Total problems: ${TOTAL}"
235
+ echo " Successful: ${SUCCESS}"
236
+
237
+ if [ "${SUCCESS}" -eq 0 ]; then
238
+ echo ""
239
+ echo "❌ Generation produced zero successful solutions for ${DATASET_NAME}"
240
+ echo " Refusing to continue with an incomplete run."
241
+ return 1
242
+ fi
243
+ fi
244
+
245
+ echo ""
246
+
247
+ # ============================================
248
+ # STEP 2: Evaluation
249
+ # ============================================
250
+ echo "================================================"
251
+ echo "🔍 STEP 2/2: Executing and evaluating..."
252
+ echo "================================================"
253
+ echo ""
254
+
255
+ local execute_args=(
256
+ --input_file "${OUTPUT_FILE}"
257
+ --output_dir "${EVAL_FILE}"
258
+ --num_workers "${NUM_WORKERS}"
259
+ --timeout "${TIMEOUT}"
260
+ --tolerance "${TOLERANCE}"
261
+ --use_relative_tolerance
262
+ )
263
+
264
+ if [ -n "${EMBEDDING_MODEL}" ]; then
265
+ execute_args+=(--embedding_model "${EMBEDDING_MODEL}")
266
+ fi
267
+
268
+ python "${EXECUTE_CLI}" "${execute_args[@]}"
269
+ EXIT_CODE=$?
270
+
271
+ if [ ${EXIT_CODE} -ne 0 ]; then
272
+ echo ""
273
+ echo "❌ Evaluation failed for ${DATASET_NAME} with exit code ${EXIT_CODE}"
274
+ return 1
275
+ fi
276
+
277
+ echo ""
278
+ echo "✅ Evaluation completed for ${DATASET_NAME}!"
279
+ echo ""
280
+
281
+ # Show evaluation report if exists
282
+ if [ -f "${EVAL_REPORT}" ]; then
283
+ echo "📊 Evaluation Results for ${DATASET_NAME}:"
284
+ cat "${EVAL_REPORT}" | jq '{
285
+ accuracy: .accuracy,
286
+ correct: .correct,
287
+ total: .total_problems,
288
+ status_counts: .status_counts
289
+ }' 2>/dev/null || cat "${EVAL_REPORT}"
290
+ echo ""
291
+
292
+ # Store results for final summary (with lock for parallel execution)
293
+ ACCURACY=$(cat "${EVAL_REPORT}" | jq -r '.accuracy' 2>/dev/null || echo "N/A")
294
+ CORRECT=$(cat "${EVAL_REPORT}" | jq -r '.correct' 2>/dev/null || echo "N/A")
295
+ TOTAL_PROBS=$(cat "${EVAL_REPORT}" | jq -r '.total_problems' 2>/dev/null || echo "N/A")
296
+
297
+ # Use lock to safely append to results file (fallback to simple append if flock not available)
298
+ RESULTS_LOCK="${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.lock"
299
+ if command -v flock >/dev/null 2>&1; then
300
+ (
301
+ flock -x 200
302
+ echo "${DATASET_NAME}|${ACCURACY}|${CORRECT}|${TOTAL_PROBS}|${EVAL_FILE}" >> "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
303
+ ) 200>"${RESULTS_LOCK}"
304
+ else
305
+ # Fallback: use simple append (may have race condition but unlikely with small writes)
306
+ echo "${DATASET_NAME}|${ACCURACY}|${CORRECT}|${TOTAL_PROBS}|${EVAL_FILE}" >> "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
307
+ fi
308
+ fi
309
+
310
+ echo "================================================"
311
+ echo ""
312
+
313
+ if [ -f "${EVAL_REPORT}" ]; then
314
+ return 0
315
+ else
316
+ return 1
317
+ fi
318
+ }
319
+
320
+ # ============================================
321
+ # Function: Run single dataset (internal, supports logging)
322
+ # ============================================
323
+ run_single_dataset_internal() {
324
+ local DATASET_NAME=$1
325
+ local LOG_FILE=$2
326
+ local STREAM_OUTPUT=${3:-false}
327
+
328
+ if [ "${STREAM_OUTPUT}" = "true" ]; then
329
+ process_dataset "${DATASET_NAME}" |& tee "${LOG_FILE}"
330
+ local EXIT_CODE=${PIPESTATUS[0]}
331
+ return ${EXIT_CODE}
332
+ else
333
+ process_dataset "${DATASET_NAME}" > "${LOG_FILE}" 2>&1
334
+ return $?
335
+ fi
336
+ }
337
+
338
+ # ============================================
339
+ # Function: Run single dataset (wrapper for sequential execution)
340
+ # ============================================
341
+ run_single_dataset() {
342
+ local DATASET_NAME=$1
343
+ local STREAM_OUTPUT=${2:-false}
344
+ local LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"
345
+
346
+ run_single_dataset_internal "${DATASET_NAME}" "${LOG_FILE}" "${STREAM_OUTPUT}"
347
+ local EXIT_CODE=$?
348
+
349
+ # Display output only when we did not already stream it live
350
+ if [ "${STREAM_OUTPUT}" != "true" ]; then
351
+ cat "${LOG_FILE}"
352
+ fi
353
+
354
+ return ${EXIT_CODE}
355
+ }
356
+
357
+ # ============================================
358
+ # Main Execution
359
+ # ============================================
360
+
361
+ echo "================================================"
362
+ echo "🚀 Generate + Evaluate Pipeline"
363
+ echo "================================================"
364
+ echo "Model: ${MODEL}"
365
+ echo "Max problems: ${MAX_PROBLEMS}"
366
+ echo "Temperature: ${TEMPERATURE}"
367
+ echo "Memory dir: ${MEMORY_DIR}"
368
+ echo "Memory Top-K: ${MEMORY_TOP_K}"
369
+ if [ -n "${EMBEDDING_MODEL}" ]; then
370
+ echo "Embedding: ${EMBEDDING_MODEL}"
371
+ else
372
+ echo "Embedding: MemoryBank default"
373
+ fi
374
+ echo "Parallel: ${PARALLEL}"
375
+ echo "Refresh Memory: ${REFRESH_DEBUG_MEMORY}"
376
+ echo "Run All Benchmarks: ${RUN_ALL_BENCHMARKS}"
377
+ echo "HF Offline: ${USE_HF_OFFLINE}"
378
+ echo "Parallel Benchmarks: ${PARALLEL_BENCHMARKS}"
379
+ if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
380
+ echo "Max Parallel Jobs: ${MAX_PARALLEL_JOBS}"
381
+ fi
382
+ echo ""
383
+ echo "Eval Workers: ${NUM_WORKERS}"
384
+ echo "Eval Timeout: ${TIMEOUT}s"
385
+ echo "Tolerance: ${TOLERANCE} (relative)"
386
+ echo ""
387
+ echo "Max retries: ${MAX_RETRIES}"
388
+ echo "================================================"
389
+ echo ""
390
+
391
+ # Activate environment
392
+ ensure_or_debate_env || exit 1
393
+
394
+ # Set Hugging Face offline mode if enabled
395
+ if [ "${USE_HF_OFFLINE}" = "true" ]; then
396
+ echo "ℹ️ Hugging Face offline mode enabled (using local cache)"
397
+ export HF_HUB_OFFLINE=1
398
+ export TRANSFORMERS_OFFLINE=1
399
+ export HF_DATASETS_OFFLINE=1
400
+ else
401
+ echo "ℹ️ Hugging Face online mode (may download models if needed)"
402
+ fi
403
+ echo ""
404
+
405
+ # Backup and clear debug memory (only once at the beginning)
406
+ backup_debug_memory
407
+
408
+ # ============================================
409
+ # Run benchmarks
410
+ # ============================================
411
+ if [ "${RUN_ALL_BENCHMARKS}" = "true" ]; then
412
+ if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
413
+ echo "================================================"
414
+ echo "🔄 Running ALL benchmarks in PARALLEL"
415
+ echo "================================================"
416
+ else
417
+ echo "================================================"
418
+ echo "🔄 Running ALL benchmarks SEQUENTIALLY"
419
+ echo "================================================"
420
+ fi
421
+ echo ""
422
+
423
+ # Define benchmark dataset names in specified order (without .jsonl extension)
424
+ # Modify this array to change the execution order
425
+ BENCHMARK_NAMES=(
426
+ "NL4OPT"
427
+ "EasyLP"
428
+ "ComplexLP"
429
+ "NLP4LP"
430
+ "ComplexOR"
431
+ "IndustryOR"
432
+ "ReSocratic"
433
+ "OPT-Principled"
434
+ )
435
+
436
+ # Count total benchmarks
437
+ TOTAL_BENCHMARKS=${#BENCHMARK_NAMES[@]}
438
+ FAILED=0
439
+ SKIPPED=0
440
+
441
+ echo "Total benchmarks to process: ${TOTAL_BENCHMARKS}"
442
+ echo ""
443
+ echo "Execution order:"
444
+ for i in "${!BENCHMARK_NAMES[@]}"; do
445
+ echo " $((i+1)). ${BENCHMARK_NAMES[$i]}"
446
+ done
447
+ echo ""
448
+
449
+ # Initialize batch results file
450
+ echo "Dataset|Accuracy|Correct|Total|Output" > "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
451
+
452
+ # Create lock file for parallel execution
453
+ RESULTS_LOCK="${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.lock"
454
+ touch "${RESULTS_LOCK}"
455
+
456
+ # Process benchmarks (parallel or sequential)
457
+ if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
458
+ # Parallel execution
459
+ declare -a PIDS=()
460
+ declare -a DATASET_NAMES=()
461
+ CURRENT_JOBS=0
462
+
463
+ for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
464
+ BENCHMARK_FILE="${BENCHMARKS_DIR}/${DATASET_NAME}.jsonl"
465
+
466
+ # Check if file exists
467
+ if [ ! -f "${BENCHMARK_FILE}" ]; then
468
+ echo "⚠️ File not found: ${BENCHMARK_FILE}"
469
+ echo " Skipping ${DATASET_NAME}..."
470
+ SKIPPED=$((SKIPPED + 1))
471
+ continue
472
+ fi
473
+
474
+ # Wait for available slot if max jobs reached
475
+ while true; do
476
+ # Count running jobs
477
+ CURRENT_JOBS=0
478
+ for PID in "${PIDS[@]}"; do
479
+ if kill -0 ${PID} 2>/dev/null; then
480
+ CURRENT_JOBS=$((CURRENT_JOBS + 1))
481
+ fi
482
+ done
483
+
484
+ # Break if we have available slots
485
+ if [ ${CURRENT_JOBS} -lt ${MAX_PARALLEL_JOBS} ]; then
486
+ break
487
+ fi
488
+
489
+ # Wait a bit before checking again
490
+ sleep 1
491
+ done
492
+
493
+ # Start job in background
494
+ LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"
495
+ echo "🚀 Starting ${DATASET_NAME} (log: ${LOG_FILE})"
496
+
497
+ (
498
+ run_single_dataset_internal "${DATASET_NAME}" "${LOG_FILE}"
499
+ EXIT_CODE=$?
500
+ if [ ${EXIT_CODE} -ne 0 ]; then
501
+ echo "[${DATASET_NAME}] ❌ Failed with exit code ${EXIT_CODE}" >> "${OUTPUT_DIR}/failures_${MAIN_TIMESTAMP}.txt"
502
+ else
503
+ echo "[${DATASET_NAME}] ✅ Completed successfully" >> "${OUTPUT_DIR}/success_${MAIN_TIMESTAMP}.txt"
504
+ fi
505
+ ) &
506
+
507
+ PID=$!
508
+ PIDS+=(${PID})
509
+ DATASET_NAMES+=("${DATASET_NAME}")
510
+ done
511
+
512
+ # Wait for all jobs to complete
513
+ echo ""
514
+ echo "⏳ Waiting for all jobs to complete..."
515
+ echo ""
516
+
517
+ for i in "${!PIDS[@]}"; do
518
+ PID=${PIDS[$i]}
519
+ DATASET_NAME=${DATASET_NAMES[$i]}
520
+ wait ${PID}
521
+ EXIT_CODE=$?
522
+ if [ ${EXIT_CODE} -ne 0 ]; then
523
+ FAILED=$((FAILED + 1))
524
+ echo "⚠️ ${DATASET_NAME} failed with exit code ${EXIT_CODE}"
525
+ fi
526
+ done
527
+
528
+ # Clean up lock file
529
+ rm -f "${RESULTS_LOCK}"
530
+
531
+ echo ""
532
+ echo "================================================"
533
+ echo "📋 Individual Job Logs:"
534
+ echo "================================================"
535
+ for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
536
+ LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"
537
+ if [ -f "${LOG_FILE}" ]; then
538
+ echo ""
539
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
540
+ echo " ${DATASET_NAME} - Log File: ${LOG_FILE}"
541
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
542
+ tail -20 "${LOG_FILE}"
543
+ fi
544
+ done
545
+ echo ""
546
+
547
+ else
548
+ # Sequential execution
549
+ CURRENT=0
550
+ for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
551
+ CURRENT=$((CURRENT + 1))
552
+ BENCHMARK_FILE="${BENCHMARKS_DIR}/${DATASET_NAME}.jsonl"
553
+
554
+ echo ""
555
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
556
+ echo " Progress: ${CURRENT}/${TOTAL_BENCHMARKS}"
557
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
558
+
559
+ # Check if file exists
560
+ if [ ! -f "${BENCHMARK_FILE}" ]; then
561
+ echo "⚠️ File not found: ${BENCHMARK_FILE}"
562
+ echo " Skipping..."
563
+ SKIPPED=$((SKIPPED + 1))
564
+ continue
565
+ fi
566
+
567
+ run_single_dataset "${DATASET_NAME}" true
568
+
569
+ if [ $? -ne 0 ]; then
570
+ FAILED=$((FAILED + 1))
571
+ echo "⚠️ Failed to process ${DATASET_NAME}, continuing..."
572
+ fi
573
+
574
+ echo ""
575
+ done
576
+
577
+ # Clean up lock file
578
+ rm -f "${RESULTS_LOCK}"
579
+ fi
580
+
581
+ # ============================================
582
+ # Final Summary for All Benchmarks
583
+ # ============================================
584
+ echo ""
585
+ echo "================================================"
586
+ echo "🎉 All Benchmarks Complete!"
587
+ echo "================================================"
588
+ echo ""
589
+ echo "Summary:"
590
+ echo " Total benchmarks: ${TOTAL_BENCHMARKS}"
591
+ echo " Successful: $((TOTAL_BENCHMARKS - FAILED - SKIPPED))"
592
+ echo " Failed: ${FAILED}"
593
+ echo " Skipped: ${SKIPPED}"
594
+ echo ""
595
+ echo "📊 Detailed Results:"
596
+ echo "================================================"
597
+
598
+ # Display formatted results table
599
+ if [ -f "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt" ]; then
600
+ echo ""
601
+ printf "%-35s | %-10s | %-10s | %-10s\n" "Dataset" "Accuracy" "Correct" "Total"
602
+ echo "--------------------------------------------------------------------------------"
603
+ tail -n +2 "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt" | while IFS='|' read -r dataset accuracy correct total output; do
604
+ printf "%-35s | %-10s | %-10s | %-10s\n" "${dataset}" "${accuracy}" "${correct}" "${total}"
605
+ done
606
+ echo ""
607
+ echo "📁 Full results saved to: ${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
608
+ fi
609
+
610
+ echo ""
611
+ echo "================================================"
612
+
613
+ else
614
+ # Run single dataset mode
615
+ echo "================================================"
616
+ echo "📝 Running single dataset: ${DEFAULT_DATASET}"
617
+ echo "================================================"
618
+ echo ""
619
+
620
+ BENCHMARK_FILE="${BENCHMARKS_DIR}/${DEFAULT_DATASET}.jsonl"
621
+ if [ ! -f "${BENCHMARK_FILE}" ]; then
622
+ echo "❌ Dataset file not found: ${BENCHMARK_FILE}"
623
+ exit 1
624
+ fi
625
+
626
+ run_single_dataset "${DEFAULT_DATASET}" true
627
+
628
+ if [ $? -ne 0 ]; then
629
+ echo ""
630
+ echo "❌ Pipeline failed"
631
+ exit 1
632
+ fi
633
+
634
+ echo ""
635
+ echo "🎉 Pipeline Complete!"
636
+ fi
637
+
638
+ echo ""
639
+ echo "✨ All done! Check the results above."
640
+ echo ""
scripts/run_memory_debate.py ADDED
@@ -0,0 +1,17 @@
+ #!/usr/bin/env python3
+ """Wrapper for debate_memory.run_memory_debate."""
+
+ from pathlib import Path
+ import sys
+
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ SRC_DIR = PROJECT_ROOT / "src"
+ if str(SRC_DIR) not in sys.path:
+     sys.path.insert(0, str(SRC_DIR))
+
+ from debate_memory.run_memory_debate import main
+
+
+ if __name__ == "__main__":
+     main()
+
scripts/test_self_healing_full.sh ADDED
@@ -0,0 +1,92 @@
1
+ #!/bin/bash
2
+
3
+ set -euo pipefail
4
+
5
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
6
+ PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
7
+ SRC_DIR="${PROJECT_ROOT}/src"
8
+ export PYTHONPATH="${SRC_DIR}:${PYTHONPATH:-}"
9
+ GENERATE_CLI="${PROJECT_ROOT}/scripts/generate_with_memory.py"
10
+
11
+ # Test self-healing mechanism with a small sample
12
+ # This will test the full pipeline with just 3 problems
13
+
14
+ echo "================================================"
15
+ echo "🧪 Testing Self-Healing Mechanism"
16
+ echo "================================================"
17
+ echo ""
18
+
19
+ # Activate conda environment
20
+ source ~/miniconda3/etc/profile.d/conda.sh
21
+ conda activate or-debate
22
+
23
+ # Test parameters
24
+ MODEL="deepseek-chat"
25
+ DATASET="IndustryOR"
26
+ MAX_PROBLEMS=3
27
+ OUTPUT_DIR="${PROJECT_ROOT}/test_output"
28
+ OUTPUT_FILE="${OUTPUT_DIR}/test_self_healing_$(date +%Y%m%d_%H%M%S).jsonl"
29
+ MEMORY_DIR="${PROJECT_ROOT}/memory_storage"
30
+ MAX_RETRIES=3
31
+
32
+ mkdir -p "${OUTPUT_DIR}"
33
+
34
+ echo "Configuration:"
35
+ echo " Model: ${MODEL}"
36
+ echo " Dataset: ${DATASET}"
37
+ echo " Max problems: ${MAX_PROBLEMS}"
38
+ echo " Max retries: ${MAX_RETRIES}"
39
+ echo " Output: ${OUTPUT_FILE}"
40
+ echo ""
41
+
42
+ # Run generation with self-healing
43
+ set +e
44
+ python "${GENERATE_CLI}" \
45
+ --dataset "${DATASET}" \
46
+ --model "${MODEL}" \
47
+ --max_problems "${MAX_PROBLEMS}" \
48
+ --output "${OUTPUT_FILE}" \
49
+ --memory_dir "${MEMORY_DIR}" \
50
+ --memory_top_k 3 \
51
+ --parallel 1 \
52
+ --max_retries "${MAX_RETRIES}" \
53
+ --execution_timeout 60
54
+ EXIT_CODE=$?
55
+ set -e
56
+
57
+
58
+ if [ ${EXIT_CODE} -ne 0 ]; then
59
+ echo ""
60
+ echo "❌ Test failed with exit code ${EXIT_CODE}"
61
+ exit 1
62
+ fi
63
+
64
+ echo ""
65
+ echo "================================================"
66
+ echo "📊 Test Results"
67
+ echo "================================================"
68
+
69
+ if [ -f "${OUTPUT_FILE}" ]; then
70
+ TOTAL=$(wc -l < "${OUTPUT_FILE}")
71
+ echo "Total problems processed: ${TOTAL}"
72
+
73
+ # Count successes
74
+ SUCCESS=$(grep -c '"execution_status": "success"' "${OUTPUT_FILE}" 2>/dev/null || true)  # grep -c already prints 0 when nothing matches
75
+ echo "Successful executions: ${SUCCESS}"
76
+
77
+ # Count with retries
78
+ RETRIED=$(grep -c '"total_attempts": [2-9]' "${OUTPUT_FILE}" 2>/dev/null || true)  # grep -c already prints 0 when nothing matches
79
+ echo "Problems that used retry: ${RETRIED}"
80
+
81
+ # Show sample result
82
+ echo ""
83
+ echo "Sample result (problem 1):"
84
+ head -1 "${OUTPUT_FILE}" | python -m json.tool | grep -E '"id"|"execution_status"|"total_attempts"|"self_healing_enabled"'
85
+
86
+ echo ""
87
+ echo "✅ Test completed successfully!"
88
+ echo "Full results saved to: ${OUTPUT_FILE}"
89
+ else
90
+ echo "❌ Output file not found: ${OUTPUT_FILE}"
91
+ exit 1
92
+ fi
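The counters above are produced with `grep -c` over the raw JSONL. An equivalent check with `jq` could look like the sketch below; it assumes `jq` is available and that each record carries the `execution_status` and `total_attempts` fields referenced in the script (the output path is illustrative):

```bash
OUT="test_output/test_self_healing_20250101_000000.jsonl"   # hypothetical path

# Successful executions
jq -s '[.[] | select(.execution_status == "success")] | length' "${OUT}"

# Problems that needed more than one attempt
jq -s '[.[] | select((.total_attempts // 1) > 1)] | length' "${OUT}"
```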
src/debate_memory/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """Debate-with-memory v2 core package."""
2
+
3
+ from importlib import metadata
4
+
5
+ try:
6
+ __version__ = metadata.version("debate-memory")
7
+ except metadata.PackageNotFoundError: # pragma: no cover - local usage
8
+ __version__ = "0.0.0"
9
+
10
+ __all__ = ["__version__"]
11
+
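Since `__version__` falls back to `"0.0.0"` whenever the `debate-memory` distribution is not installed, a quick import check from the repository root can confirm that the `src/` layout is wired up (a sketch; the fallback value is the expected output for a plain checkout):

```bash
PYTHONPATH=src python -c "import debate_memory; print(debate_memory.__version__)"
# Prints 0.0.0 unless the package has been installed as a distribution.
```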
src/debate_memory/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (404 Bytes).
src/debate_memory/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (541 Bytes).
src/debate_memory/__pycache__/build_memory_from_eval_results.cpython-311.pyc ADDED
Binary file (14.4 kB).
src/debate_memory/__pycache__/config.cpython-310.pyc ADDED
Binary file (4.82 kB).
src/debate_memory/__pycache__/config.cpython-311.pyc ADDED
Binary file (6.6 kB).
src/debate_memory/__pycache__/debate_memory_builder.cpython-311.pyc ADDED
Binary file (23.1 kB).
src/debate_memory/__pycache__/debug_executor.cpython-310.pyc ADDED
Binary file (3.7 kB).
src/debate_memory/__pycache__/debug_memory.cpython-310.pyc ADDED
Binary file (5.19 kB).
src/debate_memory/__pycache__/debug_memory_builder.cpython-311.pyc ADDED
Binary file (8.81 kB).
src/debate_memory/__pycache__/generate_with_memory.cpython-310.pyc ADDED
Binary file (24 kB).
src/debate_memory/__pycache__/generate_with_memory.cpython-311.pyc ADDED
Binary file (40.9 kB).
src/debate_memory/__pycache__/llm.cpython-310.pyc ADDED
Binary file (3.16 kB).
src/debate_memory/__pycache__/llm.cpython-311.pyc ADDED
Binary file (4.98 kB).
src/debate_memory/__pycache__/memory_bank.cpython-310.pyc ADDED
Binary file (9.19 kB).
src/debate_memory/__pycache__/memory_bank.cpython-311.pyc ADDED
Binary file (15.3 kB).
src/debate_memory/__pycache__/run_memory_debate.cpython-311.pyc ADDED
Binary file (27.6 kB).
src/debate_memory/augment_memory_from_standalone_runs.py ADDED
@@ -0,0 +1,974 @@
1
+ #!/usr/bin/env python3
2
+ """Build non-destructive memory variants from standalone pipeline runs."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import glob
8
+ import hashlib
9
+ import json
10
+ import shutil
11
+ from dataclasses import dataclass
12
+ from datetime import datetime, timezone
13
+ from pathlib import Path
14
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
15
+
16
+ from llama_index.core import Document
17
+
18
+ from .memory_bank import MemoryBank
19
+
20
+ SCRIPT_DIR = Path(__file__).resolve().parent
21
+ PROJECT_ROOT = SCRIPT_DIR.parent.parent
22
+ DEFAULT_BASE_ROOT = PROJECT_ROOT
23
+ DEFAULT_VARIANTS_ROOT = PROJECT_ROOT / "memory_variants"
24
+ DEFAULT_STANDALONE_ROOT = Path("/home/datagen/OR-Debate/standalone_pipeline/runs")
25
+
26
+ MAIN_MEMORY_DIRNAME = "memory_storage"
27
+ DEBUG_CASE_MEMORY_DIRNAME = "debug_case_memory"
28
+ DEBATE_MEMORY_DIRNAME = "debate_memory_storage"
29
+ DEBUG_MEMORY_FILENAME = "debug_memory.jsonl"
30
+
31
+ DEBUG_FAILURE_STATUSES = {
32
+ "execution_error",
33
+ "error",
34
+ "timeout",
35
+ "no_code",
36
+ "not_executed",
37
+ "success_no_objective",
38
+ "execution_failed",
39
+ }
40
+
41
+ PROMPT_ARTIFACT_HEADERS = (
42
+ "\n# Retrieved Historical Cases",
43
+ "\n# Debate Memory Insights",
44
+ "\n# Retrieved Debug Guidance",
45
+ )
46
+
47
+
48
+ @dataclass
49
+ class RunArtifacts:
50
+ source_root: Path
51
+ run_dir: Path
52
+ dataset: str
53
+ model_a: str
54
+ model_b: str
55
+ single_generated: Dict[str, Path]
56
+ debate_results: Optional[Path]
57
+ consensus_jsonl: Optional[Path]
58
+ consensus_eval: Optional[Path]
59
+ manifest_path: Optional[Path]
60
+
61
+ @property
62
+ def has_complete_debate(self) -> bool:
63
+ return bool(
64
+ self.debate_results
65
+ and self.consensus_jsonl
66
+ and self.consensus_eval
67
+ and self.debate_results.exists()
68
+ and self.consensus_jsonl.exists()
69
+ and self.consensus_eval.exists()
70
+ )
71
+
72
+
73
+ @dataclass
74
+ class ReferenceSolution:
75
+ source: str
76
+ model: str
77
+ code: str
78
+ objective_value: Optional[float]
79
+ chosen_model: Optional[str]
80
+
81
+
82
+ def now_iso() -> str:
83
+ return datetime.now(timezone.utc).isoformat()
84
+
85
+
86
+ def now_stamp() -> str:
87
+ return datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
88
+
89
+
90
+ def load_jsonl(path: Path) -> List[Dict]:
91
+ rows: List[Dict] = []
92
+ if not path or not path.exists():
93
+ return rows
94
+ with path.open("r", encoding="utf-8") as fh:
95
+ for line in fh:
96
+ line = line.strip()
97
+ if not line:
98
+ continue
99
+ try:
100
+ rows.append(json.loads(line))
101
+ except json.JSONDecodeError:
102
+ continue
103
+ return rows
104
+
105
+
106
+ def append_jsonl(path: Path, rows: Iterable[Dict]) -> int:
107
+ count = 0
108
+ with path.open("a", encoding="utf-8") as fh:
109
+ for row in rows:
110
+ fh.write(json.dumps(row, ensure_ascii=False) + "\n")
111
+ count += 1
112
+ return count
113
+
114
+
115
+ def load_json(path: Path) -> Dict:
116
+ if not path.exists():
117
+ return {}
118
+ with path.open("r", encoding="utf-8") as fh:
119
+ return json.load(fh)
120
+
121
+
122
+ def dump_json(path: Path, payload: Dict) -> None:
123
+ path.parent.mkdir(parents=True, exist_ok=True)
124
+ with path.open("w", encoding="utf-8") as fh:
125
+ json.dump(payload, fh, ensure_ascii=False, indent=2, sort_keys=True)
126
+
127
+
128
+ def count_jsonl_lines(path: Path) -> int:
129
+ if not path.exists():
130
+ return 0
131
+ with path.open("r", encoding="utf-8") as fh:
132
+ return sum(1 for line in fh if line.strip())
133
+
134
+
135
+ def float_or_none(value) -> Optional[float]:
136
+ if value is None:
137
+ return None
138
+ try:
139
+ return float(value)
140
+ except (TypeError, ValueError):
141
+ return None
142
+
143
+
144
+ def infer_models_from_run_name(run_name: str) -> Tuple[str, str]:
145
+ parts = run_name.split("_vs_")
146
+ if len(parts) != 2:
147
+ return "modelA", "modelB"
148
+ left = parts[0].split("_")
149
+ if len(left) < 2:
150
+ return left[-1], parts[1]
151
+ return "_".join(left[1:]), parts[1]
152
+
153
+
154
+ def clean_description(text: str) -> str:
155
+ cleaned = (text or "").strip()
156
+ for header in PROMPT_ARTIFACT_HEADERS:
157
+ pos = cleaned.find(header)
158
+ if pos != -1:
159
+ cleaned = cleaned[:pos].rstrip()
160
+ return cleaned
161
+
162
+
163
+ def check_correctness(
164
+ pred_obj: Optional[float],
165
+ gt_obj: Optional[float],
166
+ tolerance: float,
167
+ use_relative_tolerance: bool,
168
+ ) -> bool:
169
+ if pred_obj is None or gt_obj is None:
170
+ return False
171
+ if gt_obj == 0:
172
+ return abs(pred_obj) <= tolerance
173
+ if use_relative_tolerance:
174
+ return abs((pred_obj - gt_obj) / gt_obj) <= tolerance
175
+ return abs(pred_obj - gt_obj) <= tolerance
176
+
177
+
178
+ def sha1_short(text: str, length: int = 16) -> str:
179
+ return hashlib.sha1(text.encode("utf-8")).hexdigest()[:length]
180
+
181
+
182
+ def build_doc(problem_id: int, description: str, solution_code: str, objective_value: float, metadata: Dict) -> Document:
183
+ doc_text = f"""Problem: {description}
184
+
185
+ Solution approach:
186
+ {solution_code[:500]}...
187
+
188
+ Key features:
189
+ - Problem ID: {problem_id}
190
+ - Objective value: {objective_value}
191
+ - Status: Correct
192
+ """
193
+ return Document(
194
+ text=doc_text,
195
+ metadata={
196
+ "problem_id": problem_id,
197
+ "objective_value": objective_value,
198
+ **metadata,
199
+ },
200
+ )
201
+
202
+
203
+ class BatchMemoryAppender:
204
+ def __init__(self, memory_dir: Path, embedding_model: str) -> None:
205
+ self.memory_dir = memory_dir
206
+ self.bank = MemoryBank(memory_dir=str(memory_dir), embedding_model=embedding_model)
207
+ self.pending_cases: List[Dict] = []
208
+ self.pending_docs: List[Document] = []
209
+
210
+ def add_case(
211
+ self,
212
+ *,
213
+ problem_id: int,
214
+ problem_desc: str,
215
+ solution_code: str,
216
+ objective_value: float,
217
+ metadata: Dict,
218
+ ) -> None:
219
+ case = {
220
+ "problem_id": int(problem_id),
221
+ "description": problem_desc,
222
+ "solution_code": solution_code,
223
+ "objective_value": objective_value,
224
+ "is_correct": True,
225
+ "metadata": metadata,
226
+ }
227
+ self.pending_cases.append(case)
228
+ self.pending_docs.append(
229
+ build_doc(
230
+ problem_id=int(problem_id),
231
+ description=problem_desc,
232
+ solution_code=solution_code,
233
+ objective_value=objective_value,
234
+ metadata=metadata,
235
+ )
236
+ )
237
+
238
+ def finalize(self) -> int:
239
+ if not self.pending_cases:
240
+ return 0
241
+ with Path(self.bank.cases_file).open("a", encoding="utf-8") as fh:
242
+ for case in self.pending_cases:
243
+ fh.write(json.dumps(case, ensure_ascii=False) + "\n")
244
+ for doc in self.pending_docs:
245
+ self.bank.index.insert(doc)
246
+ self.bank.index.storage_context.persist(persist_dir=self.bank.index_dir)
247
+ added = len(self.pending_cases)
248
+ self.pending_cases.clear()
249
+ self.pending_docs.clear()
250
+ return added
251
+
252
+
253
+ def resolve_source_roots(patterns: Sequence[str]) -> List[Path]:
254
+ resolved: List[Path] = []
255
+ for pattern in patterns:
256
+ matches = glob.glob(pattern)
257
+ if matches:
258
+ for match in matches:
259
+ path = Path(match)
260
+ if path.is_dir():
261
+ resolved.append(path.resolve())
262
+ else:
263
+ path = Path(pattern)
264
+ if path.is_dir():
265
+ resolved.append(path.resolve())
266
+ deduped = sorted({path for path in resolved})
267
+ return deduped
268
+
269
+
270
+ def resolve_file(run_dir: Path, raw_value: Optional[str]) -> Optional[Path]:
271
+ if not raw_value:
272
+ return None
273
+ candidate = Path(raw_value)
274
+ if not candidate.is_absolute():
275
+ candidate = run_dir / candidate
276
+ return candidate if candidate.exists() else None
277
+
278
+
279
+ def discover_run_artifacts(source_root: Path) -> List[RunArtifacts]:
280
+ runs: List[RunArtifacts] = []
281
+ if not source_root.exists():
282
+ return runs
283
+
284
+ for run_dir in sorted(source_root.iterdir()):
285
+ if not run_dir.is_dir():
286
+ continue
287
+
288
+ manifest_path = run_dir / "run_manifest.json"
289
+ manifest = load_json(manifest_path) if manifest_path.exists() else {}
290
+
291
+ model_a, model_b = infer_models_from_run_name(run_dir.name)
292
+ model_a = manifest.get("model_a", model_a)
293
+ model_b = manifest.get("model_b", model_b)
294
+ dataset = manifest.get("dataset", source_root.name)
295
+
296
+ single_generated: Dict[str, Path] = {}
297
+ for generated in sorted(run_dir.glob("single/*/generated.jsonl")):
298
+ model_name = generated.parent.name
299
+ single_generated[model_name] = generated
300
+
301
+ model_a_generated = resolve_file(run_dir, manifest.get("model_a_generated"))
302
+ model_b_generated = resolve_file(run_dir, manifest.get("model_b_generated"))
303
+ if model_a_generated:
304
+ single_generated.setdefault(model_a, model_a_generated)
305
+ if model_b_generated:
306
+ single_generated.setdefault(model_b, model_b_generated)
307
+
308
+ debate_results = run_dir / "debate" / "debate_results.jsonl"
309
+ if not debate_results.exists():
310
+ debate_results = resolve_file(run_dir, manifest.get("debate_dir"))
311
+ if debate_results and debate_results.is_dir():
312
+ debate_results = debate_results / "debate_results.jsonl"
313
+ if debate_results and not debate_results.exists():
314
+ debate_results = None
315
+
316
+ consensus_jsonl = resolve_file(run_dir, manifest.get("consensus_jsonl"))
317
+ if consensus_jsonl is None:
318
+ candidates = sorted((run_dir / "debate").glob("consensus_*.jsonl"))
319
+ consensus_jsonl = candidates[0] if candidates else None
320
+
321
+ consensus_eval = run_dir / "consensus_eval" / "evaluation_results.jsonl"
322
+ if not consensus_eval.exists():
323
+ consensus_eval = None
324
+
325
+ runs.append(
326
+ RunArtifacts(
327
+ source_root=source_root,
328
+ run_dir=run_dir,
329
+ dataset=dataset,
330
+ model_a=model_a,
331
+ model_b=model_b,
332
+ single_generated=single_generated,
333
+ debate_results=debate_results,
334
+ consensus_jsonl=consensus_jsonl,
335
+ consensus_eval=consensus_eval,
336
+ manifest_path=manifest_path if manifest_path.exists() else None,
337
+ )
338
+ )
339
+ return runs
340
+
341
+
342
+ def load_existing_case_signatures(cases_file: Path) -> set[str]:
343
+ signatures: set[str] = set()
344
+ if not cases_file.exists():
345
+ return signatures
346
+ with cases_file.open("r", encoding="utf-8") as fh:
347
+ for line in fh:
348
+ line = line.strip()
349
+ if not line:
350
+ continue
351
+ try:
352
+ row = json.loads(line)
353
+ except json.JSONDecodeError:
354
+ continue
355
+ meta = row.get("metadata") or {}
356
+ for key in ("import_signature", "debate_signature"):
357
+ value = meta.get(key)
358
+ if value:
359
+ signatures.add(str(value))
360
+ return signatures
361
+
362
+
363
+ def load_existing_debug_signatures(debug_memory_file: Path) -> set[str]:
364
+ signatures: set[str] = set()
365
+ if not debug_memory_file.exists():
366
+ return signatures
367
+ with debug_memory_file.open("r", encoding="utf-8") as fh:
368
+ for line in fh:
369
+ line = line.strip()
370
+ if not line:
371
+ continue
372
+ try:
373
+ row = json.loads(line)
374
+ except json.JSONDecodeError:
375
+ continue
376
+ signature = row.get("signature")
377
+ if signature:
378
+ signatures.add(str(signature))
379
+ return signatures
380
+
381
+
382
+ def summarize_rounds(rounds: List[Dict], max_chars: int = 1800) -> str:
383
+ if not rounds:
384
+ return ""
385
+ lines: List[str] = []
386
+ for rnd in rounds:
387
+ lines.append(
388
+ f"Round {rnd.get('round')}: "
389
+ f"A={rnd.get('result_A')} ({rnd.get('status_A')}), "
390
+ f"B={rnd.get('result_B')} ({rnd.get('status_B')})"
391
+ )
392
+ analysis_a = (rnd.get("analysis_A") or "").strip()
393
+ analysis_b = (rnd.get("analysis_B") or "").strip()
394
+ if analysis_a:
395
+ lines.append(f"Model A analysis:\n{analysis_a}")
396
+ if analysis_b:
397
+ lines.append(f"Model B analysis:\n{analysis_b}")
398
+ lines.append("")
399
+ text = "\n".join(lines).strip()
400
+ if len(text) <= max_chars:
401
+ return text
402
+ return text[: max_chars - 16] + "\n...\n(truncated)"
403
+
404
+
405
+ def heuristic_debate_summary(entry: Dict, model_a: str, model_b: str) -> Dict:
406
+ initial_a = entry.get("initial_A_result")
407
+ initial_b = entry.get("initial_B_result")
408
+ final_result = entry.get("final_result")
409
+ chosen_model = entry.get("chosen_model") or "consensus"
410
+ rounds = entry.get("debate_rounds") or []
411
+ summary = (
412
+ f"Initial mismatch: {model_a}={initial_a}, {model_b}={initial_b}. "
413
+ f"Debate converged in {len(rounds)} rounds and selected {chosen_model} "
414
+ f"with final objective {final_result}."
415
+ )
416
+ decisive_argument = (
417
+ f"The final candidate from {chosen_model} was retained after both sides "
418
+ "aligned on the same executable outcome."
419
+ )
420
+ guardrails = [
421
+ "Compare feasibility and objective values before rewriting the model.",
422
+ "Keep a stable executable candidate whenever later edits do not improve the result.",
423
+ ]
424
+ return {
425
+ "summary": summary,
426
+ "mismatch_reason": "The two models initially disagreed on the objective value or feasibility.",
427
+ "decisive_argument": decisive_argument,
428
+ "guardrails": guardrails,
429
+ "modeling_patterns": [],
430
+ "history_excerpt": summarize_rounds(rounds),
431
+ }
432
+
433
+
434
+ def guidance_for_status(status: str) -> str:
435
+ status = (status or "").strip()
436
+ if status == "no_code":
437
+ return "Return a complete executable Python program inside a ```python``` block."
438
+ if status == "success_no_objective":
439
+ return "Print the optimized objective explicitly, for example with OBJECTIVE_VALUE after optimize()."
440
+ if status == "timeout":
441
+ return "Reduce model-construction overhead and check whether loops or constraints are exploding combinatorially."
442
+ if status == "not_executed":
443
+ return "Make sure the generated response contains runnable code and that the execution step is actually triggered."
444
+ return "Check imports, indexing, variable names, and model-object references against the traceback."
445
+
446
+
447
+ def has_disagreement(initial_a: Optional[float], initial_b: Optional[float], tolerance: float) -> bool:
448
+ if initial_a is None or initial_b is None:
449
+ return True
450
+ return abs(initial_a - initial_b) > tolerance
451
+
452
+
453
+ def choose_error_text(row: Dict) -> str:
454
+ stderr = (row.get("execution_stderr") or "").strip()
455
+ stdout = (row.get("execution_stdout") or "").strip()
456
+ status = (row.get("execution_status") or row.get("status") or "").strip()
457
+ if stderr:
458
+ return stderr
459
+ if stdout:
460
+ return stdout
461
+ if status == "no_code":
462
+ return "Generated code block is empty."
463
+ if status == "not_executed":
464
+ return "Execution did not complete and no detailed stderr/stdout was recorded."
465
+ if status == "success_no_objective":
466
+ return "Execution succeeded but no objective value could be extracted from stdout."
467
+ return status or "Unknown execution issue."
468
+
469
+
470
+ def clone_base_memory_dirs(base_root: Path, variant_dir: Path) -> Dict[str, Path]:
471
+ mapping = {}
472
+ for dirname in (MAIN_MEMORY_DIRNAME, DEBUG_CASE_MEMORY_DIRNAME, DEBATE_MEMORY_DIRNAME):
473
+ src = base_root / dirname
474
+ dst = variant_dir / dirname
475
+ shutil.copytree(src, dst)
476
+ mapping[dirname] = dst
477
+ return mapping
478
+
479
+
480
+ def main() -> None:
481
+ parser = argparse.ArgumentParser(
482
+ description="Create augmented memory-bank variants from standalone pipeline runs without touching originals."
483
+ )
484
+ parser.add_argument(
485
+ "--variant_name",
486
+ type=str,
487
+ required=True,
488
+ help="Name of the output variant directory under memory_variants/",
489
+ )
490
+ parser.add_argument(
491
+ "--source",
492
+ nargs="+",
493
+ required=True,
494
+ help="Source directories or glob patterns under standalone_pipeline/runs.",
495
+ )
496
+ parser.add_argument(
497
+ "--base_root",
498
+ type=str,
499
+ default=str(DEFAULT_BASE_ROOT),
500
+ help="Project root that contains memory_storage/debug_case_memory/debate_memory_storage.",
501
+ )
502
+ parser.add_argument(
503
+ "--variants_root",
504
+ type=str,
505
+ default=str(DEFAULT_VARIANTS_ROOT),
506
+ help="Directory under which new variants are created.",
507
+ )
508
+ parser.add_argument(
509
+ "--embedding_model",
510
+ type=str,
511
+ default="BAAI/bge-small-en-v1.5",
512
+ help="Embedding model name or local path used when updating vector indexes.",
513
+ )
514
+ parser.add_argument(
515
+ "--tolerance",
516
+ type=float,
517
+ default=0.05,
518
+ help="Correctness tolerance for imported single-model cases.",
519
+ )
520
+ parser.add_argument(
521
+ "--mismatch_tolerance",
522
+ type=float,
523
+ default=1e-3,
524
+ help="Minimum difference between initial debate results to count as a disagreement.",
525
+ )
526
+ parser.add_argument(
527
+ "--use_relative_tolerance",
528
+ action="store_true",
529
+ help="Use relative tolerance when judging single-model correctness.",
530
+ )
531
+ args = parser.parse_args()
532
+
533
+ base_root = Path(args.base_root).resolve()
534
+ variants_root = Path(args.variants_root).resolve()
535
+ source_roots = resolve_source_roots(args.source)
536
+ if not source_roots:
537
+ raise FileNotFoundError(f"No source roots matched: {args.source}")
538
+
539
+ variant_dir = variants_root / args.variant_name
540
+ if variant_dir.exists():
541
+ raise FileExistsError(f"Variant already exists: {variant_dir}")
542
+ variant_dir.parent.mkdir(parents=True, exist_ok=True)
543
+
544
+ print("=== Augment Standalone Memory Banks ===")
545
+ print(f"Base root: {base_root}")
546
+ print(f"Variant dir: {variant_dir}")
547
+ print(f"Source roots: {len(source_roots)}")
548
+ for root in source_roots:
549
+ print(f" - {root}")
550
+
551
+ memory_dirs = clone_base_memory_dirs(base_root, variant_dir)
552
+
553
+ main_memory_dir = memory_dirs[MAIN_MEMORY_DIRNAME]
554
+ debug_case_memory_dir = memory_dirs[DEBUG_CASE_MEMORY_DIRNAME]
555
+ debate_memory_dir = memory_dirs[DEBATE_MEMORY_DIRNAME]
556
+ debug_memory_file = main_memory_dir / DEBUG_MEMORY_FILENAME
557
+
558
+ main_seen = load_existing_case_signatures(main_memory_dir / "cases.jsonl")
559
+ debug_case_seen = load_existing_case_signatures(debug_case_memory_dir / "cases.jsonl")
560
+ debate_seen = load_existing_case_signatures(debate_memory_dir / "cases.jsonl")
561
+ debug_raw_seen = load_existing_debug_signatures(debug_memory_file)
562
+
563
+ main_appender = BatchMemoryAppender(main_memory_dir, args.embedding_model)
564
+ debug_case_appender = BatchMemoryAppender(debug_case_memory_dir, args.embedding_model)
565
+ debate_appender = BatchMemoryAppender(debate_memory_dir, args.embedding_model)
566
+ pending_debug_rows: List[Dict] = []
567
+
568
+ stats = {
569
+ "runs": {
570
+ "source_roots": len(source_roots),
571
+ "runs_discovered": 0,
572
+ "runs_with_manifest": 0,
573
+ "runs_with_complete_debate": 0,
574
+ "runs_partial_or_single_only": 0,
575
+ },
576
+ "memory_storage": {
577
+ "single_correct_added": 0,
578
+ "consensus_correct_added": 0,
579
+ "duplicates_skipped": 0,
580
+ "incorrect_or_missing_single_skipped": 0,
581
+ "consensus_missing_code_or_eval_skipped": 0,
582
+ },
583
+ "debug_memory": {
584
+ "raw_records_added": 0,
585
+ "case_records_added": 0,
586
+ "duplicates_skipped": 0,
587
+ "non_failure_skipped": 0,
588
+ "missing_reference_skipped": 0,
589
+ },
590
+ "debate_memory": {
591
+ "added": 0,
592
+ "duplicates_skipped": 0,
593
+ "missing_or_incorrect_skipped": 0,
594
+ },
595
+ }
596
+
597
+ all_runs: List[RunArtifacts] = []
598
+ for source_root in source_roots:
599
+ all_runs.extend(discover_run_artifacts(source_root))
600
+
601
+ stats["runs"]["runs_discovered"] = len(all_runs)
602
+ stats["runs"]["runs_with_manifest"] = sum(1 for run in all_runs if run.manifest_path)
603
+ stats["runs"]["runs_with_complete_debate"] = sum(1 for run in all_runs if run.has_complete_debate)
604
+ stats["runs"]["runs_partial_or_single_only"] = stats["runs"]["runs_discovered"] - stats["runs"]["runs_with_complete_debate"]
605
+
606
+ for run in all_runs:
607
+ print(f"Processing run: {run.run_dir}")
608
+
609
+ single_rows_by_model: Dict[str, Dict[int, Dict]] = {}
610
+ correct_single_refs: Dict[int, Dict[str, ReferenceSolution]] = {}
611
+
612
+ for model_name, generated_path in sorted(run.single_generated.items()):
613
+ rows_map: Dict[int, Dict] = {}
614
+ for row in load_jsonl(generated_path):
615
+ problem_id = row.get("id")
616
+ if problem_id is None:
617
+ continue
618
+ try:
619
+ problem_id = int(problem_id)
620
+ except (TypeError, ValueError):
621
+ continue
622
+ rows_map[problem_id] = row
623
+
624
+ code = (row.get("generated_code") or "").strip()
625
+ pred = float_or_none(row.get("execution_objective_value"))
626
+ gt = float_or_none(row.get("answer"))
627
+ is_correct = bool(code) and check_correctness(
628
+ pred,
629
+ gt,
630
+ tolerance=args.tolerance,
631
+ use_relative_tolerance=args.use_relative_tolerance,
632
+ )
633
+ if not is_correct:
634
+ stats["memory_storage"]["incorrect_or_missing_single_skipped"] += 1
635
+ continue
636
+
637
+ description = clean_description(row.get("description", ""))
638
+ signature_basis = (
639
+ f"main|single|{run.dataset}|{problem_id}|{model_name}|"
640
+ f"{sha1_short(code, 20)}|{pred}"
641
+ )
642
+ import_signature = f"standalone-main:{sha1_short(signature_basis, 20)}"
643
+ if import_signature in main_seen:
644
+ stats["memory_storage"]["duplicates_skipped"] += 1
645
+ continue
646
+
647
+ metadata = {
648
+ "source": "standalone_single_generated",
649
+ "dataset": run.dataset,
650
+ "run_dir": str(run.run_dir),
651
+ "run_name": run.run_dir.name,
652
+ "source_root": str(run.source_root),
653
+ "model": model_name,
654
+ "execution_status": row.get("execution_status", "unknown"),
655
+ "ground_truth": row.get("answer"),
656
+ "case_kind": "single",
657
+ "import_signature": import_signature,
658
+ }
659
+ main_appender.add_case(
660
+ problem_id=problem_id,
661
+ problem_desc=description,
662
+ solution_code=code,
663
+ objective_value=pred if pred is not None else 0.0,
664
+ metadata=metadata,
665
+ )
666
+ main_seen.add(import_signature)
667
+ stats["memory_storage"]["single_correct_added"] += 1
668
+ correct_single_refs.setdefault(problem_id, {})[model_name] = ReferenceSolution(
669
+ source="single",
670
+ model=model_name,
671
+ code=code,
672
+ objective_value=pred,
673
+ chosen_model=model_name,
674
+ )
675
+
676
+ single_rows_by_model[model_name] = rows_map
677
+
678
+ consensus_rows_by_id: Dict[int, Dict] = {}
679
+ debate_rows_by_id: Dict[int, Dict] = {}
680
+ eval_rows_by_id: Dict[int, Dict] = {}
681
+ consensus_refs: Dict[int, ReferenceSolution] = {}
682
+
683
+ if run.has_complete_debate:
684
+ for row in load_jsonl(run.consensus_jsonl):
685
+ problem_id = row.get("id")
686
+ if problem_id is None:
687
+ continue
688
+ try:
689
+ consensus_rows_by_id[int(problem_id)] = row
690
+ except (TypeError, ValueError):
691
+ continue
692
+ for row in load_jsonl(run.debate_results):
693
+ problem_id = row.get("problem_id")
694
+ if problem_id is None:
695
+ continue
696
+ try:
697
+ debate_rows_by_id[int(problem_id)] = row
698
+ except (TypeError, ValueError):
699
+ continue
700
+ for row in load_jsonl(run.consensus_eval):
701
+ problem_id = row.get("id")
702
+ if problem_id is None:
703
+ continue
704
+ try:
705
+ eval_rows_by_id[int(problem_id)] = row
706
+ except (TypeError, ValueError):
707
+ continue
708
+
709
+ for problem_id, eval_row in eval_rows_by_id.items():
710
+ if not eval_row.get("is_correct", False):
711
+ stats["memory_storage"]["consensus_missing_code_or_eval_skipped"] += 1
712
+ continue
713
+
714
+ consensus_row = consensus_rows_by_id.get(problem_id, {})
715
+ debate_row = debate_rows_by_id.get(problem_id, {})
716
+ code = (consensus_row.get("generated_code") or debate_row.get("final_code") or "").strip()
717
+ if not code:
718
+ stats["memory_storage"]["consensus_missing_code_or_eval_skipped"] += 1
719
+ continue
720
+
721
+ description = clean_description(
722
+ consensus_row.get("description")
723
+ or next(
724
+ (
725
+ model_rows[problem_id].get("description")
726
+ for model_rows in single_rows_by_model.values()
727
+ if problem_id in model_rows
728
+ ),
729
+ f"{run.dataset} problem {problem_id}",
730
+ )
731
+ )
732
+ pred = float_or_none(eval_row.get("predicted_objective"))
733
+ signature_basis = (
734
+ f"main|consensus|{run.dataset}|{problem_id}|"
735
+ f"{sha1_short(code, 20)}|{pred}"
736
+ )
737
+ import_signature = f"standalone-main:{sha1_short(signature_basis, 20)}"
738
+ if import_signature in main_seen:
739
+ stats["memory_storage"]["duplicates_skipped"] += 1
740
+ else:
741
+ metadata = {
742
+ "source": "standalone_consensus_eval",
743
+ "dataset": run.dataset,
744
+ "run_dir": str(run.run_dir),
745
+ "run_name": run.run_dir.name,
746
+ "source_root": str(run.source_root),
747
+ "modelA": run.model_a,
748
+ "modelB": run.model_b,
749
+ "chosen_model": debate_row.get("chosen_model") or consensus_row.get("chosen_model"),
750
+ "execution_status": eval_row.get("execution_status", "unknown"),
751
+ "ground_truth": eval_row.get("ground_truth"),
752
+ "case_kind": "consensus",
753
+ "import_signature": import_signature,
754
+ }
755
+ main_appender.add_case(
756
+ problem_id=problem_id,
757
+ problem_desc=description,
758
+ solution_code=code,
759
+ objective_value=pred if pred is not None else 0.0,
760
+ metadata=metadata,
761
+ )
762
+ main_seen.add(import_signature)
763
+ stats["memory_storage"]["consensus_correct_added"] += 1
764
+
765
+ consensus_refs[problem_id] = ReferenceSolution(
766
+ source="consensus",
767
+ model="debate_consensus",
768
+ code=code,
769
+ objective_value=pred,
770
+ chosen_model=debate_row.get("chosen_model") or consensus_row.get("chosen_model"),
771
+ )
772
+
773
+ for problem_id, debate_row in debate_rows_by_id.items():
774
+ eval_row = eval_rows_by_id.get(problem_id)
775
+ if not eval_row or not eval_row.get("is_correct", False):
776
+ stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
777
+ continue
778
+ if not debate_row.get("converged"):
779
+ stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
780
+ continue
781
+ initial_a = float_or_none(debate_row.get("initial_A_result"))
782
+ initial_b = float_or_none(debate_row.get("initial_B_result"))
783
+ if not has_disagreement(initial_a, initial_b, args.mismatch_tolerance):
784
+ stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
785
+ continue
786
+
787
+ final_code = (debate_row.get("final_code") or "").strip()
788
+ if not final_code:
789
+ stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
790
+ continue
791
+
792
+ base_desc = clean_description(
793
+ consensus_rows_by_id.get(problem_id, {}).get("description")
794
+ or next(
795
+ (
796
+ model_rows[problem_id].get("description")
797
+ for model_rows in single_rows_by_model.values()
798
+ if problem_id in model_rows
799
+ ),
800
+ f"{run.dataset} problem {problem_id}",
801
+ )
802
+ )
803
+ summary_payload = heuristic_debate_summary(debate_row, run.model_a, run.model_b)
804
+ full_desc = (
805
+ f"{base_desc}\n\n# Debate Memory Summary\n"
806
+ f"{summary_payload.get('summary', '').strip()}"
807
+ ).strip()
808
+ debate_signature = (
809
+ f"standalone-debate:{run.dataset}:{problem_id}:{sha1_short(final_code, 20)}"
810
+ )
811
+ if debate_signature in debate_seen:
812
+ stats["debate_memory"]["duplicates_skipped"] += 1
813
+ continue
814
+
815
+ metadata = {
816
+ "source": "standalone_debate_memory_import",
817
+ "dataset": run.dataset,
818
+ "run_dir": str(run.run_dir),
819
+ "run_name": run.run_dir.name,
820
+ "source_root": str(run.source_root),
821
+ "modelA": run.model_a,
822
+ "modelB": run.model_b,
823
+ "initial_A_result": initial_a,
824
+ "initial_B_result": initial_b,
825
+ "ground_truth": eval_row.get("ground_truth"),
826
+ "debate_signature": debate_signature,
827
+ "import_signature": debate_signature,
828
+ "summary": summary_payload,
829
+ }
830
+ debate_appender.add_case(
831
+ problem_id=problem_id,
832
+ problem_desc=full_desc,
833
+ solution_code=final_code,
834
+ objective_value=float_or_none(debate_row.get("final_result")) or 0.0,
835
+ metadata=metadata,
836
+ )
837
+ debate_seen.add(debate_signature)
838
+ stats["debate_memory"]["added"] += 1
839
+
840
+ for model_name, rows_map in sorted(single_rows_by_model.items()):
841
+ for problem_id, row in rows_map.items():
842
+ status = row.get("execution_status") or row.get("status") or ""
843
+ if status not in DEBUG_FAILURE_STATUSES:
844
+ stats["debug_memory"]["non_failure_skipped"] += 1
845
+ continue
846
+
847
+ reference: Optional[ReferenceSolution] = None
848
+ for other_model, ref in sorted(correct_single_refs.get(problem_id, {}).items()):
849
+ if other_model != model_name:
850
+ reference = ref
851
+ break
852
+ if reference is None:
853
+ reference = consensus_refs.get(problem_id)
854
+ if reference is None:
855
+ stats["debug_memory"]["missing_reference_skipped"] += 1
856
+ continue
857
+
858
+ description = clean_description(row.get("description", ""))
859
+ error_text = choose_error_text(row)
860
+ guidance = (
861
+ f"{guidance_for_status(status)} "
862
+ f"Reference fix source: {reference.source} ({reference.model}); "
863
+ f"target objective: {reference.objective_value}."
864
+ )
865
+ import_signature = (
866
+ f"standalone-debug:{sha1_short(f'{run.dataset}|{problem_id}|{model_name}|{status}|{error_text}|{sha1_short(reference.code, 16)}', 20)}"
867
+ )
868
+ if import_signature in debug_case_seen or import_signature in debug_raw_seen:
869
+ stats["debug_memory"]["duplicates_skipped"] += 1
870
+ continue
871
+
872
+ debug_record = {
873
+ "signature": import_signature,
874
+ "status": status,
875
+ "error_text": error_text,
876
+ "guidance": guidance,
877
+ "problem_id": problem_id,
878
+ "description": description,
879
+ "metadata": {
880
+ "source": "standalone_runs.synthetic_debug_case",
881
+ "dataset": run.dataset,
882
+ "run_dir": str(run.run_dir),
883
+ "run_name": run.run_dir.name,
884
+ "source_root": str(run.source_root),
885
+ "model": model_name,
886
+ "reference_source": reference.source,
887
+ "reference_model": reference.model,
888
+ "reference_objective": reference.objective_value,
889
+ "reference_chosen_model": reference.chosen_model,
890
+ },
891
+ "timestamp": now_iso(),
892
+ }
893
+ pending_debug_rows.append(debug_record)
894
+ debug_raw_seen.add(import_signature)
895
+
896
+ prompt_desc = (
897
+ f"{description}\n\n"
898
+ f"## Error Details\n```\n{error_text}\n```\n"
899
+ f"## Guidance\n{guidance}\n"
900
+ )
901
+ reference_code = reference.code.strip()
902
+ solution_code = (
903
+ "# Synthetic Debug Memory Case\n"
904
+ f"# Signature: {import_signature}\n"
905
+ f"# Status: {status}\n"
906
+ f"# Reference source: {reference.source} ({reference.model})\n\n"
907
+ f"{reference_code}"
908
+ )
909
+ metadata = {
910
+ "source": "standalone_runs.synthetic_debug_case",
911
+ "dataset": run.dataset,
912
+ "run_dir": str(run.run_dir),
913
+ "run_name": run.run_dir.name,
914
+ "source_root": str(run.source_root),
915
+ "model": model_name,
916
+ "status": status,
917
+ "signature": import_signature,
918
+ "reference_source": reference.source,
919
+ "reference_model": reference.model,
920
+ "reference_objective": reference.objective_value,
921
+ "reference_chosen_model": reference.chosen_model,
922
+ "import_signature": import_signature,
923
+ }
924
+ debug_case_appender.add_case(
925
+ problem_id=problem_id,
926
+ problem_desc=prompt_desc,
927
+ solution_code=solution_code,
928
+ objective_value=0.0,
929
+ metadata=metadata,
930
+ )
931
+ debug_case_seen.add(import_signature)
932
+ stats["debug_memory"]["raw_records_added"] += 1
933
+ stats["debug_memory"]["case_records_added"] += 1
934
+
935
+ append_jsonl(debug_memory_file, pending_debug_rows)
936
+
937
+ main_added = main_appender.finalize()
938
+ debug_case_added = debug_case_appender.finalize()
939
+ debate_added = debate_appender.finalize()
940
+
941
+ summary = {
942
+ "created_at": now_iso(),
943
+ "variant_dir": str(variant_dir),
944
+ "base_root": str(base_root),
945
+ "source_patterns": list(args.source),
946
+ "resolved_source_roots": [str(path) for path in source_roots],
947
+ "embedding_model": args.embedding_model,
948
+ "tolerance": args.tolerance,
949
+ "use_relative_tolerance": args.use_relative_tolerance,
950
+ "mismatch_tolerance": args.mismatch_tolerance,
951
+ "stats": stats,
952
+ "final_counts": {
953
+ "memory_storage_cases": count_jsonl_lines(main_memory_dir / "cases.jsonl"),
954
+ "debug_memory_records": count_jsonl_lines(debug_memory_file),
955
+ "debug_case_memory_cases": count_jsonl_lines(debug_case_memory_dir / "cases.jsonl"),
956
+ "debate_memory_cases": count_jsonl_lines(debate_memory_dir / "cases.jsonl"),
957
+ "main_added_persisted": main_added,
958
+ "debug_case_added_persisted": debug_case_added,
959
+ "debate_added_persisted": debate_added,
960
+ },
961
+ }
962
+ dump_json(variant_dir / "import_summary.json", summary)
963
+
964
+ print("=== Import Complete ===")
965
+ print(f"Variant: {variant_dir}")
966
+ print(f"Main memory added: {main_added}")
967
+ print(f"Debug raw added: {len(pending_debug_rows)}")
968
+ print(f"Debug case added: {debug_case_added}")
969
+ print(f"Debate memory added: {debate_added}")
970
+ print(f"Summary: {variant_dir / 'import_summary.json'}")
971
+
972
+
973
+ if __name__ == "__main__":
974
+ main()
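A hedged invocation sketch for the importer: every flag comes from the argparse definition above, while the variant name and the `--source` glob are placeholder values (the script accepts directories or glob patterns under the standalone runs root):

```bash
# Illustrative values only; --source may be repeated or given as a glob.
python scripts/augment_memory_from_standalone_runs.py \
  --variant_name demo_variant \
  --source "/home/datagen/OR-Debate/standalone_pipeline/runs/IndustryOR/*" \
  --embedding_model BAAI/bge-small-en-v1.5 \
  --tolerance 0.05 \
  --use_relative_tolerance
```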
src/debate_memory/build_memory_from_eval_results.py ADDED
@@ -0,0 +1,293 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Build solution memory from evaluation result directories.
4
+
5
+ Any evaluation directory can be used as input as long as it contains both
6
+ `evaluation_results.jsonl` and a `code/` directory. The script extracts problem
7
+ descriptions, executable code, and objective values from correct cases and
8
+ writes them into the solution-memory store.
9
+ """
10
+
11
+ import json
12
+ import os
13
+ import sys
14
+ from pathlib import Path
15
+ from typing import Dict, List, Optional
16
+
17
+ from .config import find_benchmark_path, get_benchmark_dirs, normalize_dataset_name
18
+ from .memory_bank import MemoryBank
19
+
20
+ SCRIPT_DIR = Path(__file__).resolve().parent
21
+ PROJECT_ROOT = SCRIPT_DIR.parent.parent
22
+ DEFAULT_BENCHMARKS_DIR = get_benchmark_dirs(PROJECT_ROOT)[0]
23
+
24
+
25
+ def load_evaluation_results(eval_file: str) -> Dict[int, Dict]:
26
+ """Load evaluation results as `{id: {...}}`."""
27
+ results = {}
28
+ if not os.path.exists(eval_file):
29
+ print(f"Warning: evaluation result file not found: {eval_file}")
30
+ return results
31
+
32
+ with open(eval_file, 'r', encoding='utf-8') as f:
33
+ for line in f:
34
+ if line.strip():
35
+ data = json.loads(line)
36
+ results[data['id']] = data
37
+ return results
38
+
39
+
40
+ def load_benchmark_data(benchmark_file: str) -> Dict[int, Dict]:
41
+ """Load benchmark data as `{id: {...}}`."""
42
+ data = {}
43
+ if not os.path.exists(benchmark_file):
44
+ print(f"Warning: benchmark file not found: {benchmark_file}")
45
+ return data
46
+
47
+ with open(benchmark_file, 'r', encoding='utf-8') as f:
48
+ for idx, line in enumerate(f):
49
+ if line.strip():
50
+ item = json.loads(line)
51
+ # Prefer an explicit id field, otherwise fall back to the line index.
52
+ problem_id = item.get('id', item.get('problem_id', idx))
53
+ data[problem_id] = item
54
+ return data
55
+
56
+
57
+ def load_solution_code(code_file: str) -> Optional[str]:
58
+ """Load a solution code file."""
59
+ if not os.path.exists(code_file):
60
+ return None
61
+
62
+ try:
63
+ with open(code_file, 'r', encoding='utf-8') as f:
64
+ return f.read()
65
+ except Exception as e:
66
+ print(f"Warning: failed to read code file {code_file}: {e}")
67
+ return None
68
+
69
+
70
+ def extract_dataset_name(eval_dir: str) -> Optional[str]:
71
+ """
72
+ Extract the dataset name from an evaluation directory name.
73
+
74
+ Example:
75
+ `deepseek-chat_EasyLP_clean_eval_20251024_120712.jsonl` -> `EasyLP`
76
+ """
77
+ dir_name = os.path.basename(eval_dir)
78
+ # Remove the .jsonl suffix if present.
79
+ if dir_name.endswith('.jsonl'):
80
+ dir_name = dir_name[:-6]
81
+
82
+ # Remove the model name and timestamp.
83
+ parts = dir_name.split('_')
84
+ # Locate the `eval` marker.
85
+ try:
86
+ eval_idx = parts.index('eval')
87
+ # The dataset name should appear before `eval`, after the model name.
88
+ dataset_parts = parts[:eval_idx]
89
+ if len(dataset_parts) > 1:
90
+ return normalize_dataset_name('_'.join(dataset_parts[1:]))
91
+ else:
92
+ return normalize_dataset_name(dataset_parts[0]) if dataset_parts else None
93
+ except ValueError:
94
+ # Fallback for names of the form model_dataset_timestamp.
95
+ if len(parts) >= 3:
96
+ return normalize_dataset_name('_'.join(parts[1:-1]))
97
+ return None
98
+
99
+
100
+ def build_memory_from_eval_result(eval_result_dir: str, benchmarks_dir: str, memory_bank: MemoryBank):
101
+ """
102
+ Build memory from a single evaluation result directory.
103
+
104
+ Args:
105
+ eval_result_dir: Directory containing `evaluation_results.jsonl` and `code/`.
106
+ benchmarks_dir: Benchmark dataset directory.
107
+ memory_bank: MemoryBank instance.
108
+ """
109
+ eval_file = os.path.join(eval_result_dir, 'evaluation_results.jsonl')
110
+ code_dir = os.path.join(eval_result_dir, 'code')
111
+
112
+ if not os.path.exists(eval_file):
113
+ print(f"Warning: skipping {eval_result_dir}: evaluation_results.jsonl not found")
114
+ return 0, 0
115
+
116
+ # Extract the dataset name.
117
+ dataset_name = extract_dataset_name(eval_result_dir)
118
+ if not dataset_name:
119
+ print(f"Warning: skipping {eval_result_dir}: failed to extract dataset name")
120
+ return 0, 0
121
+
122
+ benchmark_file = os.path.join(benchmarks_dir, f"{dataset_name}.jsonl")
123
+ if not os.path.exists(benchmark_file):
124
+ try:
125
+ benchmark_file = str(find_benchmark_path(PROJECT_ROOT, dataset_name))
126
+ except FileNotFoundError:
127
+ pass
128
+ if not os.path.exists(benchmark_file):
129
+ print(f"Warning: skipping {eval_result_dir}: benchmark file not found: {benchmark_file}")
130
+ return 0, 0
131
+
132
+ print(f"Processing dataset: {dataset_name}")
133
+ print(f" evaluation results: {eval_file}")
134
+ print(f" benchmark file: {benchmark_file}")
135
+ print(f" code directory: {code_dir}")
136
+
137
+ # Load all required inputs.
138
+ eval_results = load_evaluation_results(eval_file)
139
+ benchmark_data = load_benchmark_data(benchmark_file)
140
+
141
+ added_count = 0
142
+ skipped_count = 0
143
+
144
+ # Process each correct case.
145
+ for problem_id, eval_result in eval_results.items():
146
+ # Only keep correct cases.
147
+ if not eval_result.get('is_correct', False):
148
+ skipped_count += 1
149
+ continue
150
+
151
+ # Recover the problem description.
152
+ if problem_id not in benchmark_data:
153
+ print(f" Warning: skipping ID {problem_id}: missing from benchmark file")
154
+ skipped_count += 1
155
+ continue
156
+
157
+ benchmark_item = benchmark_data[problem_id]
158
+ # Support both `description` and `en_question`.
159
+ description = benchmark_item.get('description', '') or benchmark_item.get('en_question', '')
160
+
161
+ if not description:
162
+ print(f" Warning: skipping ID {problem_id}: missing problem description")
163
+ skipped_count += 1
164
+ continue
165
+
166
+ # Load the solution code.
167
+ code_file = os.path.join(code_dir, f"problem_{problem_id}.py")
168
+ solution_code = load_solution_code(code_file)
169
+
170
+ if not solution_code:
171
+ print(f" Warning: skipping ID {problem_id}: code file missing or unreadable")
172
+ skipped_count += 1
173
+ continue
174
+
175
+ # Recover the objective value.
176
+ objective_value = eval_result.get('predicted_objective')
177
+ if objective_value is None:
178
+ # Fall back to the benchmark answer fields if needed.
179
+ answer_str = benchmark_item.get('answer', '') or benchmark_item.get('en_answer', '')
180
+ try:
181
+ objective_value = float(answer_str)
182
+ except (TypeError, ValueError):
183
+ print(f" Warning: skipping ID {problem_id}: objective value unavailable")
184
+ skipped_count += 1
185
+ continue
186
+
187
+ # Build metadata for the stored case.
188
+ ground_truth = benchmark_item.get('answer', '') or benchmark_item.get('en_answer', '')
189
+ metadata = {
190
+ 'source': 'eval_results',
191
+ 'dataset': dataset_name,
192
+ 'eval_dir': os.path.basename(eval_result_dir),
193
+ 'execution_status': eval_result.get('execution_status', 'unknown'),
194
+ 'ground_truth': ground_truth,
195
+ }
196
+
197
+ # Do not deduplicate across datasets; the same problem_id may appear in multiple benchmarks.
198
+
199
+ # Add the case to the memory bank.
200
+ try:
201
+ memory_bank.add_case(
202
+ problem_id=problem_id,
203
+ problem_desc=description,
204
+ solution_code=solution_code,
205
+ objective_value=float(objective_value),
206
+ is_correct=True,
207
+ metadata=metadata
208
+ )
209
+ added_count += 1
210
+ except Exception as e:
211
+ print(f" Error: failed to add ID {problem_id}: {e}")
212
+ skipped_count += 1
213
+
214
+ print(f" added cases: {added_count}")
215
+ print(f" skipped cases: {skipped_count}")
216
+ print()
217
+
218
+ return added_count, skipped_count
219
+
220
+
221
+ def main():
222
+ import argparse
223
+
224
+ parser = argparse.ArgumentParser(description="Build a memory bank from evaluation results")
225
+ parser.add_argument('--eval_dirs', type=str, nargs='+', required=True,
226
+ help='Evaluation result directories containing evaluation_results.jsonl and code/')
227
+ parser.add_argument('--benchmarks_dir', type=str,
228
+ default=str(DEFAULT_BENCHMARKS_DIR),
229
+ help='Benchmark dataset directory')
230
+ parser.add_argument('--memory_dir', type=str,
231
+ default=str(PROJECT_ROOT / "memory_storage"),
232
+ help='Memory storage directory')
233
+ parser.add_argument('--clear', action='store_true',
234
+ help='Clear the existing memory store before building')
235
+
236
+ args = parser.parse_args()
237
+
238
+ # Validate input directories.
239
+ if not os.path.exists(args.benchmarks_dir):
240
+ print(f"Error: benchmark directory does not exist: {args.benchmarks_dir}")
241
+ sys.exit(1)
242
+
243
+ # Clear the memory store if requested.
244
+ if args.clear:
245
+ if os.path.exists(args.memory_dir):
246
+ import shutil
247
+ print(f"Clearing existing memory store: {args.memory_dir}")
248
+ shutil.rmtree(args.memory_dir)
249
+ print()
250
+
251
+ # Initialize the memory bank.
252
+ print("="*70)
253
+ print("Building Memory Bank from Evaluation Results")
254
+ print("="*70)
255
+ print()
256
+
257
+ memory_bank = MemoryBank(memory_dir=args.memory_dir)
258
+ print(f"Current memory size: {memory_bank.case_count} cases")
259
+ print()
260
+
261
+ # Process each evaluation directory.
262
+ total_added = 0
263
+ total_skipped = 0
264
+
265
+ for eval_dir in args.eval_dirs:
266
+ if not os.path.exists(eval_dir):
267
+ print(f"Warning: skipping missing directory: {eval_dir}")
268
+ continue
269
+
270
+ added, skipped = build_memory_from_eval_result(
271
+ eval_dir, args.benchmarks_dir, memory_bank
272
+ )
273
+ total_added += added
274
+ total_skipped += skipped
275
+
276
+ # Refresh the case count.
277
+ memory_bank.case_count = memory_bank._count_cases()
278
+
279
+ print("="*70)
280
+ print("Memory Bank Build Complete")
281
+ print("="*70)
282
+ print(f"Total added: {total_added} cases")
283
+ print(f"Total skipped: {total_skipped} cases")
284
+ print(f"Final memory size: {memory_bank.case_count} cases")
285
+ print()
286
+ print(f"Memory location: {args.memory_dir}")
287
+ print(f" - cases.jsonl: {os.path.join(args.memory_dir, 'cases.jsonl')}")
288
+ print(f" - index/: {os.path.join(args.memory_dir, 'index')}")
289
+ print("="*70)
290
+
291
+
292
+ if __name__ == "__main__":
293
+ main()
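A minimal invocation sketch using the flags defined above; the evaluation directory names are placeholders, and each directory must contain `evaluation_results.jsonl` plus a `code/` subdirectory:

```bash
# Illustrative paths; --clear wipes the existing memory store first.
python scripts/build_memory_from_eval_results.py \
  --eval_dirs results/deepseek-chat_EasyLP_clean_eval_20251024_120712 \
              results/deepseek-chat_IndustryOR_eval_20251025_093011 \
  --memory_dir memory_storage \
  --clear
```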
src/debate_memory/config.py ADDED
@@ -0,0 +1,189 @@
1
+ """
2
+ Configuration file for simple RAG evaluation
3
+ Contains prompt templates and other settings
4
+ """
5
+
6
+ from pathlib import Path
7
+
8
+ # ============================================
9
+ # Prompt Templates
10
+ # ============================================
11
+
12
+ # Default Gurobi prompt template
13
+ GUROBI_PROMPT = {
14
+ "system": """You are a helpful Assistant with expertise in mathematical modeling and the Gurobi solver. When the User provides an OR question, you will analyze it, build a detailed mathematical model, and provide the Gurobi code to solve it.
15
+
16
+ Your response should follow these steps:
17
+ 1. Carefully analyze the problem to identify decision variables, objective, and constraints.
18
+
19
+ 2. Develop a complete mathematical model, explicitly defining:
20
+ - Sets
21
+ - Parameters
22
+ - Decision Variables (and their types)
23
+ - Objective Function
24
+ - Constraints
25
+ 3. Provide the corresponding Gurobi Python code to implement the model.
26
+
27
+ Implementation guardrails:
28
+ - Use `gurobipy` exclusively (avoid cvxpy/pulp/coptpy imports).
29
+ - When indexing tupledict variables across periods, introduce an explicit sentinel index (e.g., period 0) for initial conditions instead of accessing undefined keys like `x[-1]`.
30
+ - Define any Big-M constants explicitly using bounds derived from the data before they appear in constraints.
31
+ - Keep the model linear/integer; if a relationship seems non-linear, introduce auxiliary variables and linearization rather than exponentiation or log constraints.
32
+ - Always ensure every symbol referenced in constraints/objective (such as `M`, helper dictionaries, etc.) is declared in the code block.
33
+ """,
34
+ "user": """Problem:
35
+ {question}
36
+
37
+ Provide a complete solution with mathematical model and Gurobi code.
38
+ """
39
+ }
40
+
41
+ # ============================================
42
+ # Model Configuration
43
+ # ============================================
44
+
45
+ # Supported models and their default temperatures
46
+ MODEL_CONFIGS = {
47
+ "gpt-4o": {"temperature": 0.01, "max_tokens": 8192},
48
+ "gpt-4o-mini": {"temperature": 0.01, "max_tokens": 8192},
49
+ "deepseek-chat": {"temperature": 0.01, "max_tokens": 8192},
50
+ "gemini-2.0-flash-exp": {"temperature": 0.01, "max_tokens": 8192},
51
+ "gemini-2.5-pro": {"temperature": 0.01, "max_tokens": 8192},
52
+ }
53
+
54
+ # ============================================
55
+ # Evaluation Configuration
56
+ # ============================================
57
+
58
+ EVAL_CONFIG = {
59
+ # Execution settings
60
+ "timeout": 60, # seconds
61
+ "max_retries": 3,
62
+
63
+ # Answer comparison settings
64
+ "tolerance": 0.05, # 5% relative tolerance by default
65
+ "use_relative_tolerance": True,
66
+ "absolute_tolerance": 1e-3, # for zero objective values
67
+
68
+ # Output settings
69
+ "save_code": True,
70
+ "save_output": False, # whether to save stdout/stderr
71
+ "verbose": False,
72
+ }
73
+
74
+ # ============================================
75
+ # Dataset Configuration
76
+ # ============================================
77
+
78
+ # Supported datasets
79
+ DATASETS = [
80
+ "ComplexLP",
81
+ "EasyLP",
82
+ "IndustryOR",
83
+ "NL4OPT",
84
+ "NLP4LP",
85
+ "ReSocratic",
86
+ "ComplexOR",
87
+ "OPT-Principled",
88
+ ]
89
+
90
+ DATASET_ALIASES = {
91
+ "complexlp_clean": "ComplexLP",
92
+ "easylp_clean": "EasyLP",
93
+ "industryor_clean": "IndustryOR",
94
+ "industryor_v2": "IndustryOR",
95
+ "industryor_fixedv2": "IndustryOR",
96
+ "industryor_fixedv2_clean": "IndustryOR",
97
+ "nl4opt": "NL4OPT",
98
+ "nl4opt_clean": "NL4OPT",
99
+ "nlp4lp_clean": "NLP4LP",
100
+ "complexor_clean": "ComplexOR",
101
+ "resocratic_clean": "ReSocratic",
102
+ "combined": "OPT-Principled",
103
+ "combined_dataset": "OPT-Principled",
104
+ "opt-principled_clean": "OPT-Principled",
105
+ }
106
+
107
+ # Dataset-specific settings (if needed)
108
+ DATASET_CONFIG = {
109
+ "ComplexLP": {"tolerance": 0.05},
110
+ "EasyLP": {"tolerance": 0.01},
111
+ "IndustryOR": {"tolerance": 0.05},
112
+ "OPT-Principled": {"tolerance": 0.05},
113
+ }
114
+
115
+ # ============================================
116
+ # Utility Functions
117
+ # ============================================
118
+
119
+ def get_prompt_template(template_name="default"):
120
+ """Get prompt template by name"""
121
+ templates = {
122
+ "default": GUROBI_PROMPT,
123
+ }
124
+ return templates.get(template_name, GUROBI_PROMPT)
125
+
126
+
127
+ def get_model_config(model_name):
128
+ """Get configuration for a specific model"""
129
+ return MODEL_CONFIGS.get(model_name, {"temperature": 0.01, "max_tokens": 8192})
130
+
131
+
132
+ def get_dataset_config(dataset_name):
133
+ """Get configuration for a specific dataset"""
134
+ return DATASET_CONFIG.get(normalize_dataset_name(dataset_name), {"tolerance": 0.05})
135
+
136
+
137
+ def normalize_dataset_name(dataset_name: str) -> str:
138
+ """Map historical dataset names to the canonical OPEN benchmark names."""
139
+ if not dataset_name:
140
+ return dataset_name
141
+
142
+ name = dataset_name.strip()
143
+ if name.endswith(".jsonl"):
144
+ name = name[:-6]
145
+
146
+ alias = DATASET_ALIASES.get(name.casefold())
147
+ if alias:
148
+ return alias
149
+
150
+ for canonical_name in DATASETS:
151
+ if canonical_name.casefold() == name.casefold():
152
+ return canonical_name
153
+
154
+ if name.endswith("_clean"):
155
+ base_name = name[:-6]
156
+ for canonical_name in DATASETS:
157
+ if canonical_name.casefold() == base_name.casefold():
158
+ return canonical_name
159
+
160
+ return name
161
+
162
+
163
+ def get_benchmark_dirs(project_root: Path) -> list[Path]:
164
+ """Return benchmark directories in priority order for the migrated OPEN layout."""
165
+ return [
166
+ project_root.parent.parent / "data" / "benchmarks",
167
+ project_root / "clean_benchmarks",
168
+ project_root.parent / "clean_benchmarks",
169
+ ]
170
+
171
+
172
+ def find_benchmark_path(project_root: Path, dataset_name: str) -> Path:
173
+ """Locate the benchmark file for a dataset, accepting legacy names as input."""
174
+ normalized_name = normalize_dataset_name(dataset_name)
175
+ candidate_names = [normalized_name]
176
+ raw_name = dataset_name[:-6] if dataset_name.endswith(".jsonl") else dataset_name
177
+ if raw_name not in candidate_names:
178
+ candidate_names.append(raw_name)
179
+
180
+ for directory in get_benchmark_dirs(project_root):
181
+ for name in candidate_names:
182
+ candidate = directory / f"{name}.jsonl"
183
+ if candidate.exists():
184
+ return candidate
185
+
186
+ raise FileNotFoundError(
187
+ f"Dataset '{dataset_name}' not found. Checked directories: "
188
+ f"{[str(path) for path in get_benchmark_dirs(project_root)]}"
189
+ )
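
A minimal usage sketch of the configuration helpers above follows. It assumes `src/` is on `PYTHONPATH` so that the package imports as `debate_memory`; the project-root path and dataset names are only illustrative.

```python
# Minimal sketch of how the config helpers compose (assumes the package
# imports as `debate_memory`; the paths below are illustrative).
from pathlib import Path

from debate_memory.config import (
    find_benchmark_path,
    get_dataset_config,
    normalize_dataset_name,
)

# Legacy dataset aliases map onto the canonical OPEN benchmark names.
print(normalize_dataset_name("easylp_clean"))   # "EasyLP"

# Dataset-specific settings fall back to a 5% tolerance when unspecified.
print(get_dataset_config("easylp_clean"))       # {'tolerance': 0.01}
print(get_dataset_config("ReSocratic"))         # {'tolerance': 0.05}

# Benchmark lookup searches the migrated OPEN layout first and raises a
# descriptive FileNotFoundError when the file is absent.
try:
    print(find_benchmark_path(Path("./code/Agora-Opt"), "easylp_clean"))
except FileNotFoundError as err:
    print(err)
```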
src/debate_memory/debate_memory_builder.py ADDED
@@ -0,0 +1,477 @@
1
+ """
2
+ Build a debate-specific memory bank from historical debate runs.
3
+
4
+ This scans existing debate result directories such as
5
+ `./results/Agora-Opt/debate/<dataset>/<timestamp>_<modelA>_vs_<modelB>/`,
6
+ identifies problems where the two single generations disagreed yet
7
+ the debate converged to a correct consensus, summarizes the key reconciliation
8
+ insights (optionally via an LLM), and stores the cases inside a dedicated
9
+ `MemoryBank` directory (default: ./debate_memory_storage).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import math
17
+ import os
18
+ from concurrent.futures import ThreadPoolExecutor, as_completed
19
+ from dataclasses import dataclass
20
+ from pathlib import Path
21
+ from typing import Dict, Iterable, List, Optional, Tuple
22
+
23
+ from tqdm import tqdm
24
+
25
+ from .llm import get_response
26
+ from .memory_bank import MemoryBank
27
+
28
+ PKG_DIR = Path(__file__).resolve().parent
29
+ PROJECT_ROOT = PKG_DIR.parent.parent
30
+ DEFAULT_RUNS_ROOT = PROJECT_ROOT.parent.parent / "results" / "Agora-Opt" / "debate"
31
+ DEFAULT_DEBATE_MEMORY_DIR = PROJECT_ROOT / "debate_memory_storage"
32
+
33
+
34
+ @dataclass
35
+ class DebateCaseInput:
36
+ dataset: str
37
+ problem_id: int
38
+ description: str
39
+ final_code: str
40
+ final_result: Optional[float]
41
+ debate_rounds: List[Dict]
42
+ modelA: str
43
+ modelB: str
44
+ run_dir: Path
45
+ ground_truth: Optional[str]
46
+ initial_A_result: Optional[float]
47
+ initial_B_result: Optional[float]
48
+ evaluation: Dict
49
+ metadata: Dict
50
+
51
+
52
+ def load_jsonl(path: Path) -> List[Dict]:
53
+ if not path.exists():
54
+ return []
55
+ data: List[Dict] = []
56
+ with path.open("r", encoding="utf-8") as fh:
57
+ for line in fh:
58
+ line = line.strip()
59
+ if not line:
60
+ continue
61
+ try:
62
+ data.append(json.loads(line))
63
+ except json.JSONDecodeError:
64
+ continue
65
+ return data
66
+
67
+
68
+ def float_or_none(value) -> Optional[float]:
69
+ if value is None:
70
+ return None
71
+ try:
72
+ return float(value)
73
+ except (ValueError, TypeError):
74
+ return None
75
+
76
+
77
+ def has_disagreement(entry: Dict, tolerance: float) -> bool:
78
+ a = float_or_none(entry.get("initial_A_result"))
79
+ b = float_or_none(entry.get("initial_B_result"))
80
+ if a is None or b is None:
81
+ return True
82
+ return abs(a - b) > tolerance
83
+
84
+
85
+ def summarize_rounds(rounds: List[Dict], max_chars: int = 2000) -> str:
86
+ if not rounds:
87
+ return ""
88
+ lines: List[str] = []
89
+ for rnd in rounds:
90
+ round_idx = rnd.get("round")
91
+ res_a = rnd.get("result_A")
92
+ res_b = rnd.get("result_B")
93
+ status_a = rnd.get("status_A")
94
+ status_b = rnd.get("status_B")
95
+ analysis_a = (rnd.get("analysis_A") or "").strip()
96
+ analysis_b = (rnd.get("analysis_B") or "").strip()
97
+ lines.append(
98
+ f"Round {round_idx}: A={res_a} ({status_a}), B={res_b} ({status_b})"
99
+ )
100
+ if analysis_a:
101
+ lines.append(f"Model A analysis:\n{analysis_a}")
102
+ if analysis_b:
103
+ lines.append(f"Model B analysis:\n{analysis_b}")
104
+ lines.append("")
105
+ text = "\n".join(lines).strip()
106
+ if len(text) <= max_chars:
107
+ return text
108
+ return text[: max_chars - 200] + "\n...\n(truncated)"
109
+
110
+
111
+ def build_summary_payload(
112
+ case: DebateCaseInput,
113
+ llm_model: Optional[str],
114
+ temperature: float,
115
+ llm_attempts: int = 1,
116
+ ) -> Dict:
117
+ history_text = summarize_rounds(case.debate_rounds)
118
+ default_summary = {
119
+ "summary": (
120
+ f"Initial mismatch: modelA={case.initial_A_result}, "
121
+ f"modelB={case.initial_B_result}. "
122
+ f"Debate converged in {len(case.debate_rounds)} rounds."
123
+ ),
124
+ "mismatch_reason": "",
125
+ "decisive_argument": "",
126
+ "guardrails": [],
127
+ "modeling_patterns": [],
128
+ }
129
+ if not llm_model:
130
+ return default_summary | {"history_excerpt": history_text}
131
+
132
+ prompt = f"""
133
+ You are helping an optimisation-debate memory builder.
134
+
135
+ Problem description:
136
+ {case.description}
137
+
138
+ Initial disagreement:
139
+ - Model A result: {case.initial_A_result}
140
+ - Model B result: {case.initial_B_result}
141
+ - Ground truth (if known): {case.ground_truth}
142
+
143
+ Debate transcript:
144
+ {history_text}
145
+
146
+ Final consensus objective: {case.final_result}
147
+
148
+ Please return a JSON object with the following keys:
149
+ - "summary": 2-3 sentences explaining how the debate resolved the mismatch.
150
+ - "mismatch_reason": concise reason for the disagreement.
151
+ - "decisive_argument": specific insight that convinced both sides.
152
+ - "guardrails": list of actionable bullet points the next debater should follow.
153
+ - "modeling_patterns": list of reusable modeling tricks/structures that appeared.
154
+
155
+ JSON ONLY. No prose outside the JSON.
156
+ """.strip()
157
+
158
+ attempts_remaining = max(1, llm_attempts)
159
+ last_error: Optional[Exception] = None
160
+ while attempts_remaining > 0:
161
+ try:
162
+ response = get_response(
163
+ prompt,
164
+ model=llm_model,
165
+ temperature=temperature,
166
+ maximum_retries=1,
167
+ )
168
+ payload = json.loads(response)
169
+ payload["history_excerpt"] = history_text
170
+ return payload
171
+ except Exception as exc: # noqa: BLE001
172
+ last_error = exc
173
+ attempts_remaining -= 1
174
+
175
+ fallback = default_summary.copy()
176
+ failure_reason = f"{last_error}" if last_error else "LLM call failed"
177
+ fallback["summary"] += f" LLM summary failed: {failure_reason}"
178
+ fallback["history_excerpt"] = history_text
179
+ return fallback
180
+
181
+
182
+ def existing_signatures(memory_dir: Path) -> set[str]:
183
+ cases_path = memory_dir / "cases.jsonl"
184
+ if not cases_path.exists():
185
+ return set()
186
+ signs: set[str] = set()
187
+ with cases_path.open("r", encoding="utf-8") as fh:
188
+ for line in fh:
189
+ line = line.strip()
190
+ if not line:
191
+ continue
192
+ try:
193
+ data = json.loads(line)
194
+ except json.JSONDecodeError:
195
+ continue
196
+ meta = data.get("metadata") or {}
197
+ sig = meta.get("debate_signature")
198
+ if sig:
199
+ signs.add(sig)
200
+ return signs
201
+
202
+
203
+ class DebateMemoryBuilder:
204
+ def __init__(
205
+ self,
206
+ runs_root: Path,
207
+ memory_dir: Path,
208
+ mismatch_tolerance: float,
209
+ llm_model: Optional[str],
210
+ temperature: float,
211
+ llm_attempts: int,
212
+ max_workers: int,
213
+ datasets: Optional[Iterable[str]] = None,
214
+ dry_run: bool = False,
215
+ ) -> None:
216
+ self.runs_root = runs_root
217
+ self.memory_dir = memory_dir
218
+ self.mismatch_tolerance = mismatch_tolerance
219
+ self.llm_model = llm_model
220
+ self.temperature = temperature
221
+ self.llm_attempts = max(1, llm_attempts)
222
+ self.max_workers = max_workers
223
+ self.datasets_filter = {d.lower() for d in datasets} if datasets else None
224
+ self.dry_run = dry_run
225
+
226
+ def build(self) -> None:
227
+ candidates = self._collect_candidates()
228
+ if not candidates:
229
+ print("No qualifying debate cases found.")
230
+ return
231
+
232
+ if not self.memory_dir.exists() and not self.dry_run:
233
+ self.memory_dir.mkdir(parents=True, exist_ok=True)
234
+
235
+ seen_sigs = existing_signatures(self.memory_dir)
236
+
237
+ bank = None if self.dry_run else MemoryBank(memory_dir=str(self.memory_dir))
238
+
239
+ added = 0
240
+ skipped_duplicates = 0
241
+ with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
242
+ futures = {
243
+ executor.submit(self._summarize_case, case): case
244
+ for case in candidates
245
+ }
246
+ for future in tqdm(as_completed(futures), total=len(futures), desc="Summarizing"):
247
+ case = futures[future]
248
+ signature = f"{case.dataset}:{case.problem_id}:{case.run_dir.name}"
249
+ if signature in seen_sigs:
250
+ skipped_duplicates += 1
251
+ continue
252
+ summary_payload = future.result()
253
+ description = (
254
+ f"{case.description.strip()}\n\n"
255
+ f"# Debate Memory Summary\n"
256
+ f"{summary_payload.get('summary', '').strip()}"
257
+ ).strip()
258
+ metadata = {
259
+ "source": "debate_memory_builder",
260
+ "dataset": case.dataset,
261
+ "run_dir": str(case.run_dir),
262
+ "modelA": case.modelA,
263
+ "modelB": case.modelB,
264
+ "initial_A_result": case.initial_A_result,
265
+ "initial_B_result": case.initial_B_result,
266
+ "ground_truth": case.ground_truth,
267
+ "debate_signature": signature,
268
+ "summary": summary_payload,
269
+ }
270
+ if self.dry_run:
271
+ added += 1
272
+ continue
273
+ try:
274
+ bank.add_case(
275
+ problem_id=case.problem_id,
276
+ problem_desc=description,
277
+ solution_code=case.final_code,
278
+ objective_value=case.final_result or 0.0,
279
+ is_correct=True,
280
+ metadata=metadata,
281
+ )
282
+ added += 1
283
+ seen_sigs.add(signature)
284
+ except Exception as exc: # noqa: BLE001
285
+ print(f"Failed to add case {signature}: {exc}")
286
+
287
+ print("===== Debate Memory Builder Summary =====")
288
+ print(f"Runs root: {self.runs_root}")
289
+ print(f"Output dir: {self.memory_dir}")
290
+ print(f"Total candidates: {len(candidates)}")
291
+ print(f"Added cases: {added}")
292
+ print(f"Duplicates skipped: {skipped_duplicates}")
293
+ if self.dry_run:
294
+ print("Dry-run mode: no cases were written.")
295
+
296
+ def _collect_candidates(self) -> List[DebateCaseInput]:
297
+ candidates: List[DebateCaseInput] = []
298
+ if not self.runs_root.exists():
299
+ print(f"Runs root not found: {self.runs_root}")
300
+ return candidates
301
+
302
+ for dataset_dir in sorted(self.runs_root.iterdir()):
303
+ if not dataset_dir.is_dir():
304
+ continue
305
+ dataset_name = dataset_dir.name
306
+ if self.datasets_filter and dataset_name.lower() not in self.datasets_filter:
307
+ continue
308
+ for run_dir in sorted(dataset_dir.iterdir()):
309
+ if not run_dir.is_dir():
310
+ continue
311
+ dataset_candidates = self._parse_run(dataset_name, run_dir)
312
+ candidates.extend(dataset_candidates)
313
+ return candidates
314
+
315
+ def _parse_run(self, dataset: str, run_dir: Path) -> List[DebateCaseInput]:
316
+ results_path = run_dir / "debate_results.jsonl"
317
+ if not results_path.exists():
318
+ return []
319
+
320
+ modelA, modelB = self._infer_models(run_dir.name)
321
+ consensus_path = next(run_dir.glob("consensus_*_vs_*.jsonl"), None)
322
+ consensus_records = load_jsonl(consensus_path) if consensus_path else []
323
+ desc_map = {int(rec["id"]): rec for rec in consensus_records if "id" in rec}
324
+
325
+ eval_path = run_dir / "eval_consensus" / "evaluation_results.jsonl"
326
+ evaluation_map = {
327
+ int(rec["id"]): rec for rec in load_jsonl(eval_path) if "id" in rec
328
+ }
329
+
330
+ run_candidates: List[DebateCaseInput] = []
331
+ for entry in load_jsonl(results_path):
332
+ problem_id = entry.get("problem_id")
333
+ if problem_id is None:
334
+ continue
335
+ problem_id = int(problem_id)
336
+ if not has_disagreement(entry, self.mismatch_tolerance):
337
+ continue
338
+ if not entry.get("converged"):
339
+ continue
340
+ evaluation = evaluation_map.get(problem_id)
341
+ desc_entry = desc_map.get(problem_id)
342
+ if desc_entry:
343
+ description = desc_entry.get("description") or f"{dataset} problem {problem_id}"
344
+ else:
345
+ description = f"Dataset {dataset} problem {problem_id}"
346
+ final_code = entry.get("final_code") or (
347
+ desc_entry.get("generated_code", "") if desc_entry else ""
348
+ )
349
+ if not final_code:
350
+ continue
351
+ debate_rounds = entry.get("debate_rounds") or []
352
+ if not debate_rounds:
353
+ continue
354
+ run_candidates.append(
355
+ DebateCaseInput(
356
+ dataset=dataset,
357
+ problem_id=problem_id,
358
+ description=description,
359
+ final_code=final_code,
360
+ final_result=float_or_none(entry.get("final_result")),
361
+ debate_rounds=debate_rounds,
362
+ modelA=modelA,
363
+ modelB=modelB,
364
+ run_dir=run_dir,
365
+ ground_truth=entry.get("ground_truth"),
366
+ initial_A_result=float_or_none(entry.get("initial_A_result")),
367
+ initial_B_result=float_or_none(entry.get("initial_B_result")),
368
+ evaluation=evaluation or {},
369
+ metadata={
370
+ "run_dir": str(run_dir),
371
+ "dataset": dataset,
372
+ },
373
+ )
374
+ )
375
+ return run_candidates
376
+
377
+ @staticmethod
378
+ def _infer_models(run_name: str) -> Tuple[str, str]:
379
+ """
380
+ Run folder format: <timestamp>_<modelA>_vs_<modelB>
381
+ """
382
+ parts = run_name.split("_vs_")
383
+ if len(parts) != 2:
384
+ return "modelA", "modelB"
385
+ left = parts[0].split("_") # timestamp + modelA pieces
386
+ if len(left) < 2:
387
+ return left[-1], parts[1]
388
+ modelA = "_".join(left[1:])
389
+ modelB = parts[1]
390
+ return modelA, modelB
391
+
392
+ def _summarize_case(self, case: DebateCaseInput) -> Dict:
393
+ return build_summary_payload(
394
+ case,
395
+ llm_model=self.llm_model,
396
+ temperature=self.temperature,
397
+ llm_attempts=self.llm_attempts,
398
+ )
399
+
400
+
401
+ def parse_args():
402
+ parser = argparse.ArgumentParser(description="Build debate memory bank from historical runs.")
403
+ parser.add_argument(
404
+ "--runs_root",
405
+ type=str,
406
+ default=str(DEFAULT_RUNS_ROOT),
407
+ help="Directory containing debate run artifacts.",
408
+ )
409
+ parser.add_argument(
410
+ "--output_dir",
411
+ type=str,
412
+ default=str(DEFAULT_DEBATE_MEMORY_DIR),
413
+ help="Directory to store the debate memory bank.",
414
+ )
415
+ parser.add_argument(
416
+ "--datasets",
417
+ type=str,
418
+ nargs="*",
419
+ default=None,
420
+ help="Optional dataset filters (case-insensitive).",
421
+ )
422
+ parser.add_argument(
423
+ "--mismatch_tolerance",
424
+ type=float,
425
+ default=1e-3,
426
+ help="Minimum absolute difference between initial results to consider a disagreement.",
427
+ )
428
+ parser.add_argument(
429
+ "--llm_model",
430
+ type=str,
431
+ default=None,
432
+ help="Optional model name for LLM-based summaries. If omitted, heuristic summaries are used.",
433
+ )
434
+ parser.add_argument(
435
+ "--temperature",
436
+ type=float,
437
+ default=0.3,
438
+ help="Temperature for LLM summaries.",
439
+ )
440
+ parser.add_argument(
441
+ "--max_workers",
442
+ type=int,
443
+ default=4,
444
+ help="Parallel workers for summary generation.",
445
+ )
446
+ parser.add_argument(
447
+ "--llm_attempts",
448
+ type=int,
449
+ default=2,
450
+ help="Number of LLM attempts per case before falling back to heuristics.",
451
+ )
452
+ parser.add_argument(
453
+ "--dry_run",
454
+ action="store_true",
455
+ help="Run the pipeline without writing to the memory bank.",
456
+ )
457
+ return parser.parse_args()
458
+
459
+
460
+ def main():
461
+ args = parse_args()
462
+ builder = DebateMemoryBuilder(
463
+ runs_root=Path(args.runs_root),
464
+ memory_dir=Path(args.output_dir),
465
+ mismatch_tolerance=args.mismatch_tolerance,
466
+ llm_model=args.llm_model,
467
+ temperature=args.temperature,
468
+ llm_attempts=args.llm_attempts,
469
+ max_workers=args.max_workers,
470
+ datasets=args.datasets,
471
+ dry_run=args.dry_run,
472
+ )
473
+ builder.build()
474
+
475
+
476
+ if __name__ == "__main__":
477
+ main()
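
For completeness, the builder can also be driven programmatically instead of through `parse_args()`. The sketch below is hedged: the paths and dataset filter are illustrative, and it assumes the package imports as `debate_memory`.

```python
# Programmatic equivalent of the CLI entry point (illustrative paths;
# assumes the package imports as `debate_memory`).
from pathlib import Path

from debate_memory.debate_memory_builder import DebateMemoryBuilder

builder = DebateMemoryBuilder(
    runs_root=Path("results/Agora-Opt/debate"),
    memory_dir=Path("debate_memory_storage"),
    mismatch_tolerance=1e-3,
    llm_model=None,            # heuristic summaries only, no LLM calls
    temperature=0.3,
    llm_attempts=2,
    max_workers=4,
    datasets=["IndustryOR"],   # optional case-insensitive filter
    dry_run=True,              # count candidates without writing cases
)
builder.build()
```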
src/debate_memory/debug_executor.py ADDED
@@ -0,0 +1,136 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Execute generated Python code and capture basic diagnostics."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import os
7
+ import re
8
+ import subprocess
9
+ import sys
10
+ from dataclasses import dataclass
11
+ from typing import Optional
12
+
13
+
14
+ AUTO_OBJECTIVE_SNIPPET = """
15
+ # Auto-added snippet: attempt to print the objective value for downstream evaluation.
16
+ try:
17
+ candidate = None
18
+ for name in ("model", "m", "Model"):
19
+ if name in globals():
20
+ candidate = globals()[name]
21
+ break
22
+ if candidate is not None and hasattr(candidate, "objVal"):
23
+ print(f"OBJECTIVE_VALUE: {candidate.objVal}")
24
+ except Exception:
25
+ pass
26
+ """.strip()
27
+
28
+
29
+ @dataclass
30
+ class ExecutionResult:
31
+ status: str
32
+ stdout: str
33
+ stderr: str
34
+ objective_value: Optional[float]
35
+ returncode: Optional[int]
36
+ code_path: Optional[str]
37
+
38
+
39
+ def _ensure_directory(path: str) -> None:
40
+ os.makedirs(path, exist_ok=True)
41
+
42
+
43
+ def _append_objective_snippet(code: str) -> str:
44
+ if "OBJECTIVE_VALUE" in code:
45
+ return code if code.endswith("\n") else code + "\n"
46
+ return f"{code.rstrip()}\n\n{AUTO_OBJECTIVE_SNIPPET}\n"
47
+
48
+
49
+ def _normalize_output(value: object) -> str:
50
+ if value is None:
51
+ return ""
52
+ if isinstance(value, bytes):
53
+ return value.decode("utf-8", errors="replace")
54
+ return str(value)
55
+
56
+
57
+ def _extract_objective_value(output: str) -> Optional[float]:
58
+ if not output:
59
+ return None
60
+ patterns = [
61
+ r"OBJECTIVE_VALUE:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
62
+ r"Optimal\s+[Oo]bjective[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
63
+ r"Obj:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
64
+ r"Objective\s+value:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
65
+ ]
66
+ for pattern in patterns:
67
+ match = re.search(pattern, output, re.IGNORECASE)
68
+ if not match:
69
+ continue
70
+ try:
71
+ return float(match.group(1))
72
+ except ValueError:
73
+ continue
74
+ return None
75
+
76
+
77
+ def execute_generated_code(
78
+ code: str,
79
+ problem_id: int,
80
+ output_dir: str,
81
+ timeout: int = 120,
82
+ ) -> ExecutionResult:
83
+ """Write code to disk, execute it, and capture the outcome."""
84
+ code_dir = os.path.join(output_dir, "code")
85
+ _ensure_directory(code_dir)
86
+
87
+ code_with_snippet = _append_objective_snippet(code)
88
+ code_file = os.path.join(code_dir, f"problem_{problem_id}.py")
89
+ with open(code_file, "w", encoding="utf-8") as fh:
90
+ fh.write(code_with_snippet)
91
+
92
+ try:
93
+ completed = subprocess.run(
94
+ [sys.executable, os.path.basename(code_file)],
95
+ cwd=code_dir,
96
+ capture_output=True,
97
+ text=True,
98
+ timeout=timeout,
99
+ )
100
+ except subprocess.TimeoutExpired as exc:
101
+ return ExecutionResult(
102
+ status="timeout",
103
+ stdout=_normalize_output(exc.stdout),
104
+ stderr=f"Execution timeout after {timeout} seconds",
105
+ objective_value=None,
106
+ returncode=None,
107
+ code_path=code_file,
108
+ )
109
+ except Exception as exc: # pragma: no cover - defensive
110
+ return ExecutionResult(
111
+ status="error",
112
+ stdout="",
113
+ stderr=str(exc),
114
+ objective_value=None,
115
+ returncode=None,
116
+ code_path=code_file,
117
+ )
118
+
119
+ stdout = _normalize_output(completed.stdout)
120
+ stderr = _normalize_output(completed.stderr)
121
+ returncode = completed.returncode
122
+
123
+ status = "success" if returncode == 0 else "execution_error"
124
+ objective_value = _extract_objective_value(stdout) if status == "success" else None
125
+
126
+ return ExecutionResult(
127
+ status=status,
128
+ stdout=stdout,
129
+ stderr=stderr,
130
+ objective_value=objective_value,
131
+ returncode=returncode,
132
+ code_path=code_file,
133
+ )
134
+
135
+
136
+ __all__ = ["ExecutionResult", "execute_generated_code"]
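
The executor above can be exercised on a toy model as in the following sketch. It assumes the package imports as `debate_memory` and that `gurobipy` is installed and licensed; the output directory is illustrative.

```python
# Sketch: run a small generated script through execute_generated_code
# (assumes `gurobipy` is installed and licensed; paths are illustrative).
from debate_memory.debug_executor import execute_generated_code

toy_code = """
import gurobipy as gp
from gurobipy import GRB

model = gp.Model("toy")
x = model.addVar(name="x", ub=10)
model.setObjective(2 * x, GRB.MAXIMIZE)
model.optimize()
"""

result = execute_generated_code(toy_code, problem_id=0, output_dir="./tmp_debug_run", timeout=60)
print(result.status)           # "success" when the script exits cleanly
print(result.objective_value)  # 20.0, parsed from the auto-added OBJECTIVE_VALUE print
```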
src/debate_memory/debug_memory.py ADDED
@@ -0,0 +1,163 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Lightweight persistence for debugging experiences."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import hashlib
7
+ import json
8
+ import threading
9
+ from dataclasses import dataclass, asdict
10
+ from datetime import datetime, timezone
11
+ from pathlib import Path
12
+ from typing import Any, Dict, Iterable, List, Optional
13
+
14
+
15
+ def _now_iso() -> str:
16
+ return datetime.now(timezone.utc).isoformat()
17
+
18
+
19
+ def _normalise_error(text: str) -> str:
20
+ return (text or "").strip()
21
+
22
+
23
+ @dataclass
24
+ class DebugRecord:
25
+ """Single debugging observation stored on disk."""
26
+
27
+ signature: str
28
+ status: str
29
+ error_text: str
30
+ guidance: str
31
+ problem_id: Optional[int]
32
+ description: str
33
+ metadata: Dict[str, Any]
34
+ timestamp: str
35
+
36
+ def to_dict(self) -> Dict[str, Any]:
37
+ return asdict(self)
38
+
39
+
40
+ _PKG_DIR = Path(__file__).resolve().parent
41
+ _PROJECT_ROOT = _PKG_DIR.parent.parent
42
+
43
+
44
+ class DebugMemoryStore:
45
+ """Append-only store keyed by error signature."""
46
+
47
+ DEFAULT_PATH = _PROJECT_ROOT / "memory_storage" / "debug_memory.jsonl"
48
+
49
+ def __init__(self, path: Optional[str] = None):
50
+ self.path = Path(path) if path else self.DEFAULT_PATH
51
+ self.path.parent.mkdir(parents=True, exist_ok=True)
52
+ if not self.path.exists():
53
+ self.path.touch()
54
+ self._lock = threading.Lock()
55
+
56
+ @staticmethod
57
+ def _signature_from_error(error_text: str, status: str) -> str:
58
+ basis = _normalise_error(error_text)
59
+ if not basis:
60
+ basis = status or "unknown"
61
+ digest = hashlib.sha1(basis.encode("utf-8")).hexdigest()[:12]
62
+ return digest
63
+
64
+ def _append(self, record: DebugRecord) -> None:
65
+ payload = json.dumps(record.to_dict(), ensure_ascii=False)
66
+ with self._lock, self.path.open("a", encoding="utf-8") as fh:
67
+ fh.write(payload + "\n")
68
+
69
+ def record_execution_feedback(
70
+ self,
71
+ *,
72
+ problem_id: Optional[int],
73
+ description: str,
74
+ status: str,
75
+ error_text: str,
76
+ guidance: str,
77
+ source: str,
78
+ metadata: Optional[Dict[str, Any]] = None,
79
+ ) -> str:
80
+ """Persist execution feedback and return the signature used."""
81
+ signature_core = self._signature_from_error(error_text, status)
82
+ signature = f"exec:{signature_core}"
83
+ record = DebugRecord(
84
+ signature=signature,
85
+ status=status or "unknown",
86
+ error_text=_normalise_error(error_text) or status or "",
87
+ guidance=(guidance or "").strip(),
88
+ problem_id=problem_id,
89
+ description=(description or "").strip(),
90
+ metadata={
91
+ "source": source,
92
+ **(metadata or {}),
93
+ },
94
+ timestamp=_now_iso(),
95
+ )
96
+ self._append(record)
97
+ return signature
98
+
99
+ def record_validation_feedback(
100
+ self,
101
+ *,
102
+ problem_id: Optional[int],
103
+ issues: Iterable[str],
104
+ metadata: Optional[Dict[str, Any]] = None,
105
+ source: str = "validation",
106
+ ) -> List[str]:
107
+ """Persist validation feedback items and return the signatures used."""
108
+ signatures: List[str] = []
109
+ for issue in issues:
110
+ if not issue:
111
+ continue
112
+ signature_core = self._signature_from_error(issue, "validation")
113
+ signature = f"validation:{signature_core}"
114
+ record = DebugRecord(
115
+ signature=signature,
116
+ status="validation",
117
+ error_text=_normalise_error(issue),
118
+ guidance="",
119
+ problem_id=problem_id,
120
+ description="",
121
+ metadata={
122
+ "source": source,
123
+ **(metadata or {}),
124
+ },
125
+ timestamp=_now_iso(),
126
+ )
127
+ self._append(record)
128
+ signatures.append(signature)
129
+ return signatures
130
+
131
+ def retrieve_for_problem(self, problem_id: int, limit: int = 3) -> List[DebugRecord]:
132
+ """Return recent records for a given problem id (best-effort)."""
133
+ if problem_id is None:
134
+ return []
135
+ matches: List[DebugRecord] = []
136
+ with self.path.open("r", encoding="utf-8") as fh:
137
+ for line in fh:
138
+ line = line.strip()
139
+ if not line:
140
+ continue
141
+ try:
142
+ payload = json.loads(line)
143
+ except json.JSONDecodeError:
144
+ continue
145
+ if payload.get("problem_id") != problem_id:
146
+ continue
147
+ matches.append(
148
+ DebugRecord(
149
+ signature=payload.get("signature", ""),
150
+ status=payload.get("status", ""),
151
+ error_text=payload.get("error_text", ""),
152
+ guidance=payload.get("guidance", ""),
153
+ problem_id=payload.get("problem_id"),
154
+ description=payload.get("description", ""),
155
+ metadata=payload.get("metadata", {}) or {},
156
+ timestamp=payload.get("timestamp", ""),
157
+ )
158
+ )
159
+ matches.sort(key=lambda item: item.timestamp, reverse=True)
160
+ return matches[:limit] if limit else matches
161
+
162
+
163
+ __all__ = ["DebugMemoryStore", "DebugRecord"]
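
A short sketch of the store in use, with an illustrative path and record contents (assumes the package imports as `debate_memory`):

```python
# Sketch: append one execution-feedback record and read it back
# (illustrative path and values; assumes the package imports as `debate_memory`).
from debate_memory.debug_memory import DebugMemoryStore

store = DebugMemoryStore("./tmp_memory/debug_memory.jsonl")

signature = store.record_execution_feedback(
    problem_id=7,
    description="Blending problem with ratio constraints",
    status="execution_error",
    error_text="GurobiError: Model is infeasible or unbounded",
    guidance="Check the objective sense and add the missing upper bounds.",
    source="manual_example",
)
print(signature)  # e.g. "exec:<12-character sha1 prefix>"

for record in store.retrieve_for_problem(7, limit=3):
    print(record.timestamp, record.status, record.error_text)
```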
src/debate_memory/debug_memory_builder.py ADDED
@@ -0,0 +1,150 @@
1
+ """Convert debug_memory.jsonl records into a searchable MemoryBank."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import glob
7
+ import hashlib
8
+ import json
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional
12
+
13
+ from .memory_bank import MemoryBank
14
+
15
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
16
+ LEGACY_ROOT = PROJECT_ROOT.parent / "debate_with_memory"
17
+
18
+
19
+ def _default_inputs() -> List[str]:
20
+ candidates = [
21
+ PROJECT_ROOT / "memory_storage" / "debug_memory.jsonl",
22
+ LEGACY_ROOT / "memory_storage" / "debug_memory.jsonl",
23
+ PROJECT_ROOT / "memory_storage" / "backups" / "*" / "debug_memory.jsonl",
24
+ LEGACY_ROOT / "memory_storage" / "backups" / "*" / "debug_memory.jsonl",
25
+ ]
26
+ return [str(path) for path in candidates]
27
+
28
+
29
+ def _stable_id(signature: str) -> int:
30
+ digest = hashlib.sha1(signature.encode("utf-8")).hexdigest()
31
+ return int(digest[:12], 16)
32
+
33
+
34
+ def _parse_timestamp(ts: Optional[str]) -> datetime:
35
+ if not ts:
36
+ return datetime.min
37
+ try:
38
+ return datetime.fromisoformat(ts)
39
+ except ValueError:
40
+ return datetime.min
41
+
42
+
43
+ def load_debug_records(input_globs: List[str]) -> Dict[str, Dict]:
44
+ records: Dict[str, Dict] = {}
45
+ files: List[str] = []
46
+ for pattern in input_globs:
47
+ files.extend(glob.glob(pattern))
48
+ files = sorted({Path(f) for f in files if Path(f).exists()})
49
+ for file_path in files:
50
+ with file_path.open("r", encoding="utf-8") as fh:
51
+ for line in fh:
52
+ line = line.strip()
53
+ if not line:
54
+ continue
55
+ try:
56
+ record = json.loads(line)
57
+ except json.JSONDecodeError:
58
+ continue
59
+ signature = record.get("signature")
60
+ if not signature:
61
+ continue
62
+ ts = _parse_timestamp(record.get("timestamp"))
63
+ existing = records.get(signature)
64
+ if existing is None or ts > existing.get("_ts", datetime.min):
65
+ record["_ts"] = ts
66
+ records[signature] = record
67
+ return records
68
+
69
+
70
+ def build_debug_memory(records: Dict[str, Dict], output_dir: Path, clear: bool) -> None:
71
+ if clear and output_dir.exists():
72
+ for child in output_dir.iterdir():
73
+ if child.is_file():
74
+ child.unlink()
75
+ else:
76
+ import shutil
77
+
78
+ shutil.rmtree(child)
79
+ bank = MemoryBank(memory_dir=str(output_dir))
80
+ added = 0
81
+ for signature, record in records.items():
82
+ description = record.get("description", "Unknown problem")
83
+ error_text = record.get("error_text", "")
84
+ guidance = record.get("guidance", "")
85
+ status = record.get("status", "")
86
+ metadata = {
87
+ "signature": signature,
88
+ "status": status,
89
+ "timestamp": record.get("timestamp"),
90
+ **(record.get("metadata") or {}),
91
+ }
92
+ note_lines = ["# Debug Memory Case", f"Signature: {signature}", f"Status: {status}"]
93
+ if guidance:
94
+ note_lines.append(f"Guidance: {guidance}")
95
+ note_lines.append("---")
96
+ if error_text:
97
+ note_lines.append("Error snippet:\n" + error_text)
98
+ note_lines.append("---")
99
+ note_lines.append(f"Source metadata: {metadata}")
100
+ prompt_desc = (
101
+ f"{description}\n\n## Error Details\n```\n{error_text}\n```\n"
102
+ f"## Guidance\n{guidance or 'N/A'}\n"
103
+ )
104
+ problem_id = record.get("problem_id")
105
+ if problem_id is None:
106
+ problem_id = _stable_id(signature)
107
+ try:
108
+ bank.add_case(
109
+ problem_id=int(problem_id),
110
+ problem_desc=prompt_desc,
111
+ solution_code="\n".join(note_lines),
112
+ objective_value=0.0,
113
+ is_correct=True,
114
+ metadata=metadata,
115
+ )
116
+ added += 1
117
+ except Exception as exc: # noqa: BLE001
118
+ print(f"Failed to add debug case {signature}: {exc}")
119
+ print(f"✅ Added {added} debug cases to {output_dir}")
120
+
121
+
122
+ def parse_args():
123
+ parser = argparse.ArgumentParser(description="Build debug memory bank from debug_memory.jsonl records")
124
+ parser.add_argument(
125
+ "--input", nargs="*", default=_default_inputs(), help="Input files/globs containing debug records",
126
+ )
127
+ parser.add_argument(
128
+ "--output_dir",
129
+ type=str,
130
+ default=str(PROJECT_ROOT / "debug_case_memory"),
131
+ help="Where to store the constructed memory bank",
132
+ )
133
+ parser.add_argument(
134
+ "--clear",
135
+ action="store_true",
136
+ help="Remove existing output_dir contents before rebuilding",
137
+ )
138
+ return parser.parse_args()
139
+
140
+
141
+ def main():
142
+ args = parse_args()
143
+ records = load_debug_records(args.input)
144
+ print(f"Loaded {len(records)} unique debug signatures")
145
+ build_debug_memory(records, Path(args.output_dir), clear=args.clear)
146
+
147
+
148
+ if __name__ == "__main__":
149
+ main()
150
+
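
The rebuild step can also be invoked from Python. This is only a sketch: the input path is illustrative, and it assumes the package imports as `debate_memory` and that the `MemoryBank` dependencies (including its embedding model) are available locally.

```python
# Sketch: rebuild the searchable debug memory from raw JSONL records
# (illustrative input path; requires the MemoryBank dependencies).
from pathlib import Path

from debate_memory.debug_memory_builder import build_debug_memory, load_debug_records

records = load_debug_records(["memory_storage/debug_memory.jsonl"])
print(f"Loaded {len(records)} unique debug signatures")
build_debug_memory(records, Path("debug_case_memory"), clear=True)
```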
src/debate_memory/debug_utils.py ADDED
@@ -0,0 +1,99 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Minimal helpers for generated code execution reports."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import os
8
+ from dataclasses import dataclass, asdict
9
+ from typing import List, Optional
10
+
11
+ from .debug_memory import DebugMemoryStore
12
+
13
+
14
+ @dataclass
15
+ class DebugMetadata:
16
+ problem_id: int
17
+ notes: List[str]
18
+
19
+ def to_json(self) -> str:
20
+ return json.dumps(asdict(self), ensure_ascii=False, indent=2)
21
+
22
+
23
+ def sanitize_code(code: str, problem_id: int):
24
+ """Ensure code ends with a newline and capture any lightweight notes."""
25
+ metadata = DebugMetadata(problem_id=problem_id, notes=[])
26
+ cleaned = (code or "").rstrip() + "\n" if code else ""
27
+ return cleaned, metadata
28
+
29
+
30
+ def save_debug_metadata(metadata: DebugMetadata, output_dir: str) -> None:
31
+ """Persist metadata only when there is something noteworthy."""
32
+ if not metadata.notes:
33
+ return
34
+ debug_dir = os.path.join(output_dir, "debug")
35
+ os.makedirs(debug_dir, exist_ok=True)
36
+ path = os.path.join(debug_dir, f"problem_{metadata.problem_id}.json")
37
+ with open(path, "w", encoding="utf-8") as fh:
38
+ fh.write(metadata.to_json())
39
+
40
+
41
+ def write_debug_suggestions(
42
+ problem_id: int,
43
+ description: str,
44
+ error_message: str,
45
+ memory_helper,
46
+ memory_bank,
47
+ output_dir: str,
48
+ *,
49
+ status: str,
50
+ debug_store: Optional[DebugMemoryStore] = None,
51
+ top_k_cases: int = 3,
52
+ ) -> None:
53
+ """Write a straightforward debug report and optionally record the memory."""
54
+ _ = memory_helper, memory_bank, top_k_cases # Unused but kept for interface compatibility.
55
+ debug_dir = os.path.join(output_dir, "debug")
56
+ os.makedirs(debug_dir, exist_ok=True)
57
+ path = os.path.join(debug_dir, f"problem_{problem_id}_suggestions.md")
58
+
59
+ lines: List[str] = [
60
+ f"# Debug Report for Problem {problem_id}",
61
+ "",
62
+ f"- **Status:** {status}",
63
+ ]
64
+ if description:
65
+ lines.extend(["", "## Description", description.strip(), ""])
66
+ if error_message:
67
+ lines.extend(
68
+ [
69
+ "## Error Traceback",
70
+ "```",
71
+ error_message.strip(),
72
+ "```",
73
+ "",
74
+ ]
75
+ )
76
+ else:
77
+ lines.extend(["", "## Error Traceback", "_No traceback captured._", ""])
78
+
79
+ lines.append("## Notes")
80
+ lines.append("")
81
+ lines.append("Automated debugging is not yet implemented. Review the trace above for hints.")
82
+ lines.append("")
83
+
84
+ with open(path, "w", encoding="utf-8") as fh:
85
+ fh.write("\n".join(lines))
86
+
87
+ if debug_store:
88
+ debug_store.record_execution_feedback(
89
+ problem_id=problem_id,
90
+ description=description,
91
+ status=status,
92
+ error_text=error_message or status,
93
+ guidance="Automated debugging is not yet implemented.",
94
+ source="debug_utils.write_debug_suggestions",
95
+ metadata={},
96
+ )
97
+
98
+
99
+ __all__ = ["DebugMetadata", "sanitize_code", "save_debug_metadata", "write_debug_suggestions"]
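
Since automated debugging is deliberately stubbed out here, the report writer can be called directly; the sketch below uses illustrative values and passes `None` for the unused memory arguments (assumes the package imports as `debate_memory`).

```python
# Sketch: emit a debug report for a failed run without any memory helpers
# (illustrative values; writes ./tmp_debug_run/debug/problem_3_suggestions.md).
from debate_memory.debug_utils import write_debug_suggestions

write_debug_suggestions(
    problem_id=3,
    description="Knapsack variant with volume limits",
    error_message="NameError: name 'gp' is not defined",
    memory_helper=None,   # unused, kept for interface compatibility
    memory_bank=None,     # unused, kept for interface compatibility
    output_dir="./tmp_debug_run",
    status="execution_error",
)
```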
src/debate_memory/execute.py ADDED
@@ -0,0 +1,522 @@
1
+ """
2
+ Execute and evaluate generated Gurobi code
3
+ """
4
+
5
+ import argparse
6
+ import json
7
+ import os
8
+ import re
9
+ import subprocess
10
+ import sys
11
+ from collections import defaultdict
12
+ from pathlib import Path
13
+ from typing import Dict, List
14
+ from concurrent.futures import ProcessPoolExecutor, as_completed
15
+ from tqdm import tqdm
16
+
17
+ from .debug_utils import sanitize_code, save_debug_metadata, write_debug_suggestions
18
+
19
+ SCRIPT_DIR = Path(__file__).resolve().parent
20
+ PROJECT_ROOT = SCRIPT_DIR.parent.parent
21
+ DEFAULT_MEMORY_DIR = PROJECT_ROOT / "memory_storage"
22
+ DEFAULT_GUIDELINES = DEFAULT_MEMORY_DIR / "category_guidelines.jsonl"
23
+ DEFAULT_DEBUG_MEMORY = DEFAULT_MEMORY_DIR / "debug_memory.jsonl"
24
+
25
+
26
+ def extract_objective_value(output: str) -> float:
27
+ """
28
+ Extract objective value from Gurobi output
29
+
30
+ Args:
31
+ output: stdout from Gurobi code execution
32
+
33
+ Returns:
34
+ Objective value as float, or None if not found
35
+ """
36
+ if not output or output.strip() == "":
37
+ return None
38
+
39
+ # Common patterns in Gurobi output
40
+ patterns = [
41
+ r'Optimal\s+[Oo]bjective[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
42
+ r'Obj:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
43
+ r'Best\s+objective\s+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
44
+ r'Objective\s+value:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
45
+ r'OBJECTIVE_VALUE:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)', # Our auto-added pattern
46
+ ]
47
+
48
+ for pattern in patterns:
49
+ match = re.search(pattern, output, re.IGNORECASE)
50
+ if match:
51
+ try:
52
+ return float(match.group(1))
53
+ except ValueError:
54
+ continue
55
+
56
+ # Fallback: check common custom labels printed by prompts
57
+ fallback_patterns = [
58
+ r'Total\s+Cost[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
59
+ r'Total\s+Profit[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
60
+ r'Total\s+Net\s+Profit[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
61
+ r'Total\s+Revenue[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
62
+ ]
63
+
64
+ for pattern in fallback_patterns:
65
+ match = re.search(pattern, output, re.IGNORECASE)
66
+ if match:
67
+ try:
68
+ return float(match.group(1))
69
+ except ValueError:
70
+ continue
71
+
72
+ return None
73
+
74
+
75
+ def enhance_code_with_objective_print(code: str) -> str:
76
+ """
77
+ Add objective value printing to code if not already present
78
+
79
+ This helps ensure we can extract the objective value even if
80
+ the generated code doesn't print it explicitly.
81
+
82
+ Note: Always adds a fallback print to handle cases where existing
83
+ prints are conditional (e.g., inside if status == OPTIMAL blocks)
84
+ """
85
+ # Add code to print objective value (always add as a safety measure)
86
+ enhancement_code = """
87
+ # Auto-added: Print objective value for evaluation (fallback)
88
+ try:
89
+ # Try common variable names for Gurobi model
90
+ if 'model' in dir():
91
+ mdl = model
92
+ elif 'm' in dir():
93
+ mdl = m
94
+ elif 'Model' in dir():
95
+ mdl = Model
96
+ else:
97
+ mdl = None
98
+
99
+ # Fallback: scan globals for a likely Gurobi model instance.
100
+ # This helps when the generated code uses a non-standard variable name.
101
+ if mdl is None:
102
+ try:
103
+ for _name, _val in list(globals().items()):
104
+ try:
105
+ if hasattr(_val, 'objVal') and hasattr(_val, 'optimize'):
106
+ mdl = _val
107
+ break
108
+ except Exception:
109
+ continue
110
+ except Exception:
111
+ pass
112
+
113
+ if mdl is not None and hasattr(mdl, 'objVal'):
114
+ try:
115
+ obj_value = mdl.objVal
116
+ print(f"OBJECTIVE_VALUE: {obj_value}")
117
+ except:
118
+ # Model might not have been solved yet
119
+ pass
120
+ except:
121
+ pass
122
+ """
123
+
124
+ return code + "\n" + enhancement_code
125
+
126
+
127
+ def execute_code(code: str, problem_id: int, output_dir: str, timeout: int = 60) -> Dict:
128
+ """
129
+ Execute Gurobi code and capture results
130
+
131
+ Args:
132
+ code: Python code to execute
133
+ problem_id: Problem ID
134
+ output_dir: Directory to save code files
135
+ timeout: Execution timeout in seconds
136
+
137
+ Returns:
138
+ Dictionary with execution results
139
+ """
140
+ # Create output directory
141
+ code_dir = os.path.join(output_dir, 'code')
142
+ os.makedirs(code_dir, exist_ok=True)
143
+
144
+ sanitized_code, debug_meta = sanitize_code(code, problem_id)
145
+ code_enhanced = enhance_code_with_objective_print(sanitized_code)
146
+
147
+ # Save code to file
148
+ code_file = os.path.join(code_dir, f'problem_{problem_id}.py')
149
+ with open(code_file, 'w', encoding='utf-8') as f:
150
+ f.write(code_enhanced)
151
+
152
+ # Persist debug metadata if anything noteworthy was detected
153
+ save_debug_metadata(debug_meta, output_dir)
154
+
155
+ # Execute code
156
+ try:
157
+ result = subprocess.run(
158
+ [sys.executable, f'problem_{problem_id}.py'],
159
+ capture_output=True,
160
+ text=True,
161
+ timeout=timeout,
162
+ cwd=code_dir
163
+ )
164
+
165
+ stdout = result.stdout
166
+ stderr = result.stderr
167
+ returncode = result.returncode
168
+
169
+ if returncode == 0:
170
+ obj_value = extract_objective_value(stdout)
171
+ if obj_value is not None:
172
+ return {
173
+ 'status': 'success',
174
+ 'objective_value': obj_value,
175
+ 'stdout': stdout,
176
+ 'stderr': stderr
177
+ }
178
+ else:
179
+ return {
180
+ 'status': 'success_no_objective',
181
+ 'objective_value': None,
182
+ 'stdout': stdout,
183
+ 'stderr': stderr
184
+ }
185
+ else:
186
+ return {
187
+ 'status': 'execution_error',
188
+ 'objective_value': None,
189
+ 'stdout': stdout,
190
+ 'stderr': stderr,
191
+ 'returncode': returncode
192
+ }
193
+
194
+ except subprocess.TimeoutExpired:
195
+ return {
196
+ 'status': 'timeout',
197
+ 'objective_value': None,
198
+ 'stdout': '',
199
+ 'stderr': f'Execution timeout after {timeout} seconds'
200
+ }
201
+ except Exception as e:
202
+ return {
203
+ 'status': 'error',
204
+ 'objective_value': None,
205
+ 'stdout': '',
206
+ 'stderr': str(e)
207
+ }
208
+
209
+
210
+ def check_correctness(pred_obj: float, gt_obj: float, tolerance: float = 0.05,
211
+ use_relative: bool = True) -> bool:
212
+ """
213
+ Check if predicted objective matches ground truth
214
+
215
+ Args:
216
+ pred_obj: Predicted objective value
217
+ gt_obj: Ground truth objective value
218
+ tolerance: Tolerance for comparison
219
+ use_relative: Use relative tolerance if True, absolute if False
220
+
221
+ Returns:
222
+ True if values match within tolerance
223
+ """
224
+ if pred_obj is None or gt_obj is None:
225
+ return False
226
+
227
+ try:
228
+ pred_obj = float(pred_obj)
229
+ gt_obj = float(gt_obj)
230
+
231
+ if gt_obj == 0:
232
+ return abs(pred_obj) <= tolerance
233
+
234
+ if use_relative:
235
+ return abs((pred_obj - gt_obj) / gt_obj) <= tolerance
236
+ else:
237
+ return abs(pred_obj - gt_obj) <= tolerance
238
+ except (ValueError, TypeError):
239
+ return False
240
+
241
+
242
+ def evaluate_results(results: List[Dict], args) -> Dict:
243
+ """
244
+ Evaluate execution results
245
+
246
+ Args:
247
+ results: List of result dictionaries
248
+ args: Command line arguments
249
+
250
+ Returns:
251
+ Evaluation report dictionary
252
+ """
253
+ total = len(results)
254
+ correct = 0
255
+
256
+ status_counts = defaultdict(int)
257
+ correct_ids = []
258
+ incorrect_details = []
259
+
260
+ for result in results:
261
+ status = result['execution_status']
262
+ status_counts[status] += 1
263
+
264
+ if status == 'success' and result['is_correct']:
265
+ correct += 1
266
+ correct_ids.append(result['id'])
267
+ elif status == 'success' and not result['is_correct']:
268
+ incorrect_details.append({
269
+ 'id': result['id'],
270
+ 'predicted': result['predicted_objective'],
271
+ 'ground_truth': result['ground_truth']
272
+ })
273
+
274
+ accuracy = correct / total if total > 0 else 0.0
275
+
276
+ report = {
277
+ 'total_problems': total,
278
+ 'correct': correct,
279
+ 'accuracy': accuracy,
280
+ 'status_counts': dict(status_counts),
281
+ 'correct_ids': correct_ids,
282
+ 'incorrect_details': incorrect_details[:10], # Save first 10 for reference
283
+ 'settings': {
284
+ 'tolerance': args.tolerance,
285
+ 'use_relative_tolerance': args.use_relative_tolerance,
286
+ 'timeout': args.timeout
287
+ }
288
+ }
289
+
290
+ return report
291
+
292
+
293
+ def process_single_problem(gen_result, args):
294
+ """Process a single problem (for parallel execution)"""
295
+ problem_id = gen_result['id']
296
+ code = gen_result['generated_code']
297
+ gt_answer = gen_result.get('answer')
298
+
299
+ if not code:
300
+ result = {
301
+ 'id': problem_id,
302
+ 'execution_status': 'no_code',
303
+ 'predicted_objective': None,
304
+ 'ground_truth': gt_answer,
305
+ 'is_correct': False
306
+ }
307
+ else:
308
+ exec_result = execute_code(code, problem_id, args.output_dir, args.timeout)
309
+
310
+ pred_obj = exec_result['objective_value']
311
+ is_correct = False
312
+
313
+ if pred_obj is not None and gt_answer is not None:
314
+ try:
315
+ gt_obj = float(gt_answer)
316
+ is_correct = check_correctness(
317
+ pred_obj, gt_obj,
318
+ args.tolerance,
319
+ args.use_relative_tolerance
320
+ )
321
+ except (ValueError, TypeError):
322
+ is_correct = False
323
+
324
+ result = {
325
+ 'id': problem_id,
326
+ 'execution_status': exec_result['status'],
327
+ 'predicted_objective': pred_obj,
328
+ 'ground_truth': gt_answer,
329
+ 'is_correct': is_correct,
330
+ 'stdout': exec_result['stdout'][:500] if args.save_output else '',
331
+ 'stderr': exec_result['stderr'][:500] if args.save_output else ''
332
+ }
333
+
334
+ return result
335
+
336
+
337
+ def main(args):
338
+ # Load generated results
339
+ if not os.path.exists(args.input_file):
340
+ raise FileNotFoundError(f"Input file not found: {args.input_file}")
341
+
342
+ with open(args.input_file, 'r', encoding='utf-8') as f:
343
+ generated_results = [json.loads(line) for line in f if line.strip()]
344
+
345
+ print(f"Loaded {len(generated_results)} generated results")
346
+
347
+ # Create output directory
348
+ os.makedirs(args.output_dir, exist_ok=True)
349
+ id_to_problem = {record['id']: record for record in generated_results}
350
+
351
+ debug_store = None
352
+ memory_helper = None
353
+ memory_bank = None
354
+ if not args.disable_debug_memory:
355
+ try:
356
+ from .debug_memory import DebugMemoryStore
357
+ from .memory_bank import MemoryBank
358
+ from .memory_intelligence import MemoryIntelligence
359
+ except ModuleNotFoundError as exc:
360
+ print(
361
+ f"⚠️ Debug-memory dependencies missing ({exc}). "
362
+ "Continuing with --disable_debug_memory behavior."
363
+ )
364
+ args.disable_debug_memory = True
365
+ else:
366
+ debug_store = DebugMemoryStore(args.debug_memory_path)
367
+ if args.category_guidelines_path:
368
+ try:
369
+ memory_helper = MemoryIntelligence(args.category_guidelines_path)
370
+ except Exception as exc: # noqa: BLE001
371
+ print(f"Warning: failed to load category guidelines ({exc})")
372
+ if args.memory_dir:
373
+ try:
374
+ if args.embedding_model:
375
+ memory_bank = MemoryBank(args.memory_dir, embedding_model=args.embedding_model)
376
+ else:
377
+ memory_bank = MemoryBank(args.memory_dir)
378
+ except Exception as exc: # noqa: BLE001
379
+ print(f"Warning: failed to load memory bank from {args.memory_dir} ({exc})")
380
+
381
+ # Execute and evaluate each result
382
+ evaluation_results = []
383
+
384
+ if args.num_workers > 1:
385
+ # Parallel execution
386
+ print(f"Using {args.num_workers} workers for parallel execution")
387
+ with ProcessPoolExecutor(max_workers=args.num_workers) as executor:
388
+ # Submit all tasks
389
+ future_to_problem = {
390
+ executor.submit(process_single_problem, gen_result, args): gen_result
391
+ for gen_result in generated_results
392
+ }
393
+
394
+ # Collect results with progress bar
395
+ with tqdm(total=len(generated_results), desc="Executing") as pbar:
396
+ for future in as_completed(future_to_problem):
397
+ try:
398
+ result = future.result()
399
+ evaluation_results.append(result)
400
+ status_symbol = '✓' if result['is_correct'] else '✗'
401
+ pbar.set_postfix_str(f"Problem {result['id']}: {status_symbol}")
402
+ pbar.update(1)
403
+ except Exception as e:
404
+ gen_result = future_to_problem[future]
405
+ print(f"\nError processing problem {gen_result['id']}: {e}")
406
+ evaluation_results.append({
407
+ 'id': gen_result['id'],
408
+ 'execution_status': 'error',
409
+ 'predicted_objective': None,
410
+ 'ground_truth': gen_result.get('answer'),
411
+ 'is_correct': False,
412
+ 'stdout': '',
413
+ 'stderr': str(e)
414
+ })
415
+ pbar.update(1)
416
+
417
+ # Sort results by ID to maintain order
418
+ evaluation_results.sort(key=lambda x: x['id'])
419
+ else:
420
+ # Sequential execution (original behavior)
421
+ for gen_result in generated_results:
422
+ problem_id = gen_result['id']
423
+ print(f"Processing problem {problem_id}...", end=' ')
424
+
425
+ result = process_single_problem(gen_result, args)
426
+ evaluation_results.append(result)
427
+
428
+ status_symbol = '✓' if result['is_correct'] else '✗'
429
+ print(f"{status_symbol} [{result['execution_status']}]")
430
+
431
+ # Provide memory-aided suggestions for failures
432
+ if not args.disable_debug_memory:
433
+ for result in evaluation_results:
434
+ status = result['execution_status']
435
+ if status in ('execution_error', 'success_no_objective', 'timeout', 'no_code'):
436
+ gen_result = id_to_problem.get(result['id'], {})
437
+ description = gen_result.get('description', '')
438
+ error_message = result.get('stderr') or result.get('stdout') or ''
439
+ if not error_message:
440
+ if status == 'timeout':
441
+ error_message = 'Execution timeout'
442
+ elif status == 'no_code':
443
+ error_message = 'No code was generated for execution.'
444
+ elif status == 'success_no_objective':
445
+ error_message = 'Execution succeeded but no objective value was captured.'
446
+ write_debug_suggestions(
447
+ problem_id=result['id'],
448
+ description=description,
449
+ error_message=error_message,
450
+ memory_helper=memory_helper,
451
+ memory_bank=memory_bank,
452
+ output_dir=args.output_dir,
453
+ status=status,
454
+ debug_store=debug_store,
455
+ )
456
+
457
+ # Generate evaluation report
458
+ report = evaluate_results(evaluation_results, args)
459
+
460
+ # Save detailed results
461
+ results_file = os.path.join(args.output_dir, 'evaluation_results.jsonl')
462
+ with open(results_file, 'w', encoding='utf-8') as f:
463
+ for result in evaluation_results:
464
+ f.write(json.dumps(result, ensure_ascii=False) + '\n')
465
+
466
+ # Save evaluation report
467
+ report_file = os.path.join(args.output_dir, 'evaluation_report.json')
468
+ with open(report_file, 'w', encoding='utf-8') as f:
469
+ json.dump(report, f, indent=2, ensure_ascii=False)
470
+
471
+ # Print summary
472
+ print(f"\n{'='*60}")
473
+ print("EVALUATION SUMMARY")
474
+ print(f"{'='*60}")
475
+ print(f"Total problems: {report['total_problems']}")
476
+ print(f"Correct: {report['correct']}")
477
+ print(f"Accuracy: {report['accuracy']:.2%}")
478
+ print(f"\nStatus breakdown:")
479
+ for status, count in sorted(report['status_counts'].items()):
480
+ print(f" {status:20s}: {count:3d} ({count/report['total_problems']:.1%})")
481
+ print(f"{'='*60}")
482
+ print(f"\nResults saved to:")
483
+ print(f" {results_file}")
484
+ print(f" {report_file}")
485
+
486
+
487
+ def parse_args():
488
+ parser = argparse.ArgumentParser(description="Execute and evaluate generated Gurobi code")
489
+
490
+ parser.add_argument("--input_file", type=str, required=True,
491
+ help="Path to generated results JSONL file")
492
+ parser.add_argument("--output_dir", type=str, required=True,
493
+ help="Directory to save execution results")
494
+ parser.add_argument("--timeout", type=int, default=60,
495
+ help="Timeout for code execution (seconds)")
496
+ parser.add_argument("--tolerance", type=float, default=0.05,
497
+ help="Tolerance for answer comparison")
498
+ parser.add_argument("--use_relative_tolerance", action="store_true",
499
+ help="Use relative tolerance (default: absolute)")
500
+ parser.add_argument("--save_output", action="store_true",
501
+ help="Save stdout/stderr in results")
502
+ parser.add_argument("--num_workers", type=int, default=100,
503
+ help="Number of parallel workers for execution")
504
+ parser.add_argument("--memory_dir", type=str, default=str(DEFAULT_MEMORY_DIR),
505
+ help="Path to episodic memory directory (used for debug suggestions)")
506
+ parser.add_argument("--embedding_model", type=str, default=None,
507
+ help="Optional embedding model name or local path for debug-memory retrieval")
508
+ parser.add_argument("--category_guidelines_path", type=str,
509
+ default=str(DEFAULT_GUIDELINES),
510
+ help="Path to category guideline JSONL file")
511
+ parser.add_argument("--debug_memory_path", type=str,
512
+ default=str(DEFAULT_DEBUG_MEMORY),
513
+ help="Path to persistent debug memory JSONL file")
514
+ parser.add_argument("--disable_debug_memory", action="store_true",
515
+ help="Disable memory-based debug suggestions")
516
+
517
+ return parser.parse_args()
518
+
519
+
520
+ if __name__ == "__main__":
521
+ args = parse_args()
522
+ main(args)
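
The answer-matching helpers above are pure functions, so they are easy to sanity-check in isolation. A small sketch (numbers are illustrative; assumes the package imports as `debate_memory`):

```python
# Sketch: objective extraction plus tolerance-based correctness checks
# (illustrative numbers; assumes the package imports as `debate_memory`).
from debate_memory.execute import check_correctness, extract_objective_value

stdout = "Optimize a model with 3 rows, 2 columns\nOBJECTIVE_VALUE: 1047.5\n"
pred = extract_objective_value(stdout)
print(pred)  # 1047.5

print(check_correctness(pred, 1000.0, tolerance=0.05, use_relative=True))   # True, 4.75% off
print(check_correctness(pred, 1000.0, tolerance=0.01, use_relative=True))   # False
print(check_correctness(0.0004, 0.0, tolerance=0.001, use_relative=True))   # True, absolute fallback at gt == 0
```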
src/debate_memory/generate_with_memory.py ADDED
@@ -0,0 +1,920 @@
1
+ """
2
+ Generate with Memory: Single solution generation enhanced by memory retrieval
3
+ Based on simple_rag/generate.py + memory enhancement
4
+ """
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import re
10
+ from pathlib import Path
11
+ from collections import Counter
12
+ from typing import Dict, List, Optional
13
+ from tqdm import tqdm
14
+ from concurrent.futures import ThreadPoolExecutor, as_completed
15
+
16
+ # Import local utilities
17
+ from .llm import get_response
18
+ from .config import find_benchmark_path, get_prompt_template, normalize_dataset_name
19
+
20
+ # Import memory bank
21
+ from .memory_bank import MemoryBank
22
+ from .debug_memory import DebugMemoryStore
23
+ from .debug_executor import execute_generated_code, ExecutionResult
24
+
25
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
26
+ DEFAULT_MEMORY_DIR = PROJECT_ROOT / "memory_storage"
27
+ DEFAULT_DEBUG_MEMORY = DEFAULT_MEMORY_DIR / "debug_memory.jsonl"
28
+ DEFAULT_DEBUG_CASE_MEMORY = PROJECT_ROOT / "debug_case_memory"
29
+
30
+
31
+ class NoOpMemoryBank:
32
+ """Memory-bank stub used when retrieval is explicitly disabled."""
33
+
34
+ case_count = 0
35
+
36
+ def retrieve_similar_cases(self, query: str, top_k: int = 0):
37
+ return []
38
+
39
+ def format_retrieved_cases_for_prompt(self, similar_cases):
40
+ return ""
41
+
42
+
43
+ def load_dataset(dataset_name: str) -> List[Dict]:
44
+ """
45
+ Load dataset from the migrated benchmark directory layout.
46
+
47
+ Args:
48
+ dataset_name: Name of the dataset (e.g., "ComplexLP", "IndustryOR")
49
+
50
+ Returns:
51
+ List of problem dictionaries with 'description' and 'answer' fields
52
+ """
53
+ dataset_name = normalize_dataset_name(dataset_name)
54
+ dataset_path = find_benchmark_path(PROJECT_ROOT, dataset_name)
55
+
56
+ problems = []
57
+ with dataset_path.open('r', encoding='utf-8') as f:
58
+ for idx, line in enumerate(f):
59
+ if line.strip():
60
+ data = json.loads(line)
61
+ # Map en_question to description if it exists
62
+ if 'en_question' in data and 'description' not in data:
63
+ data['description'] = data['en_question']
64
+ # Map en_answer to answer if it exists
65
+ if 'en_answer' in data and 'answer' not in data:
66
+ data['answer'] = data['en_answer']
67
+ # Set id if not already present
68
+ if 'id' not in data:
69
+ data['id'] = idx
70
+ problems.append(data)
71
+
72
+ print(f"Loaded {len(problems)} problems from {dataset_name}")
73
+ return problems
74
+
75
+
76
+ def extract_python_code(text: str) -> str:
77
+ """
78
+ Extract Python code from LLM output
79
+ Looks for code within <python>...</python> tags or ```python...``` blocks
80
+
81
+ Args:
82
+ text: LLM output text
83
+
84
+ Returns:
85
+ Extracted Python code
86
+ """
87
+ # Try to extract from <python>...</python> tags first
88
+ pattern_xml = r'<python>(.*?)</python>'
89
+ match = re.search(pattern_xml, text, re.DOTALL | re.IGNORECASE)
90
+ if match:
91
+ code = match.group(1).strip()
92
+ # Remove markdown code fences if present
93
+ code = re.sub(r'^```python\s*\n', '', code)
94
+ code = re.sub(r'\n```\s*$', '', code)
95
+ return code
96
+
97
+ # Try to extract from ```python...``` blocks
98
+ pattern_markdown = r'```python(.*?)```'
99
+ match = re.search(pattern_markdown, text, re.DOTALL)
100
+ if match:
101
+ return match.group(1).strip()
102
+
103
+ # If no code blocks found, return empty string
104
+ return ""
105
+
106
+
107
+ def _truncate_text(text: str, limit: int = 1200) -> str:
108
+ if isinstance(text, bytes):
109
+ text = text.decode("utf-8", errors="replace")
110
+ snippet = (text or "").strip()
111
+ if not snippet:
112
+ return ""
113
+ if len(snippet) <= limit:
114
+ return snippet
115
+ return snippet[:limit] + "\n... (truncated)"
116
+
117
+
118
+ def write_debug_report(
119
+ problem_id: int,
120
+ description: str,
121
+ exec_result: ExecutionResult,
122
+ base_output_dir: str,
123
+ ) -> str:
124
+ debug_dir = os.path.join(base_output_dir, "debug")
125
+ os.makedirs(debug_dir, exist_ok=True)
126
+ path = os.path.join(debug_dir, f"problem_{problem_id}_debug.md")
127
+
128
+ stdout_snippet = _truncate_text(exec_result.stdout)
129
+ stderr_snippet = _truncate_text(exec_result.stderr)
130
+
131
+ lines = [
132
+ f"# Debug Report for Problem {problem_id}",
133
+ "",
134
+ f"- **Status:** {exec_result.status}",
135
+ ]
136
+ if exec_result.code_path:
137
+ rel_path = os.path.relpath(exec_result.code_path, base_output_dir)
138
+ lines.append(f"- **Code path:** {rel_path}")
139
+ if description:
140
+ lines.extend(["", "## Description", description.strip()])
141
+ if stdout_snippet:
142
+ lines.extend(["", "## Stdout", "```", stdout_snippet, "```"])
143
+ if stderr_snippet:
144
+ lines.extend(["", "## Stderr", "```", stderr_snippet, "```"])
145
+ if not stdout_snippet and not stderr_snippet:
146
+ lines.extend(["", "## Logs", "_No logs captured._"])
147
+
148
+ with open(path, "w", encoding="utf-8") as fh:
149
+ fh.write("\n".join(lines) + "\n")
150
+
151
+ return path
152
+
153
+
154
+ def filter_perfect_matches(similar_cases: List[Dict], current_description: str, max_filter: int = 1) -> List[Dict]:
155
+ """
156
+ Filter out cases with identical description (test set leakage)
157
+ At most max_filter cases will be removed (default: 1)
158
+
159
+ Args:
160
+ similar_cases: List of retrieved cases with scores
161
+ current_description: The description of current problem to compare against
162
+ max_filter: Maximum number of perfect matches to filter out (default: 1)
163
+
164
+ Returns:
165
+ Filtered list of cases
166
+ """
167
+ filtered = []
168
+ filtered_count = 0
169
+
170
+ for case in similar_cases:
171
+ case_desc = case['case'].get('description', '')
172
+ problem_id = case['case'].get('problem_id', '?')
173
+ score = case.get('score', 0.0)
174
+
175
+ # Compare descriptions directly (exact match)
176
+ # At most filter max_filter identical cases
177
+ if case_desc.strip() == current_description.strip() and filtered_count < max_filter:
178
+ filtered_count += 1
179
+ print(f" ⚠️ Filtered: Case ID={problem_id}, similarity={score:.4f} (identical description, test set leakage)")
180
+ else:
181
+ filtered.append(case)
182
+
183
+ if filtered_count > 0:
184
+ print(f" 📊 Filtered {filtered_count} perfect match(es) (max: {max_filter}), {len(filtered)} cases remaining")
185
+
186
+ return filtered
187
+
188
+
189
+ def refine_retrieved_cases_with_llm(
190
+ similar_cases: List[Dict],
191
+ current_problem_desc: str,
192
+ model: str,
193
+ temperature: float = 0.3
194
+ ) -> str:
195
+ """
196
+ Use LLM to analyze ALL retrieved cases together and extract key insights
197
+
198
+ This is a two-stage process:
199
+ 1. Retrieve similar cases (vector similarity)
200
+ 2. Use LLM to view ALL cases holistically and extract transferable insights
201
+
202
+ Args:
203
+ similar_cases: List of retrieved cases
204
+ current_problem_desc: Current problem description
205
+ model: Model name for analysis
206
+ temperature: Temperature for analysis (slightly higher for creativity)
207
+
208
+ Returns:
209
+ Refined insights as a string
210
+ """
211
+ if not similar_cases:
212
+ return ""
213
+
214
+ # Build full cases content (no truncation - show everything to LLM)
215
+ full_cases = ""
216
+ for i, item in enumerate(similar_cases, 1):
217
+ case = item['case']
218
+ score = item['score']
219
+ full_cases += f"\n{'='*70}\n"
220
+ full_cases += f"Case {i} (Similarity Score: {score:.3f})\n"
221
+ full_cases += f"{'='*70}\n\n"
222
+ full_cases += f"**Problem Description:**\n{case['description']}\n\n"
223
+ full_cases += f"**Complete Solution Code:**\n```python\n{case['solution_code']}\n```\n\n"
224
+ full_cases += f"**Objective Value:** {case['objective_value']}\n"
225
+ full_cases += f"**Status:** Correct ✓\n"
226
+ full_cases += "\n"
227
+
228
+ analysis_prompt = f"""You are an expert in optimization modeling. You will analyze multiple similar solved problems to extract **transferable insights** for a new problem.
229
+
230
+ ## Current Problem to Solve:
231
+ {current_problem_desc}
232
+
233
+ ## Retrieved Similar Cases (Complete):
234
+ {full_cases}
235
+
236
+ ## Your Task:
237
+
238
+ Analyze ALL the cases above **holistically** and provide a structured analysis that will guide solving the current problem.
239
+
240
+ **Focus on:**
241
+
242
+ 1. **Problem Type & Structure**: What category do these problems fall into? (e.g., production planning, resource allocation, scheduling, network flow)
243
+
244
+ 2. **Common Modeling Patterns**:
245
+ - What decision variables are typically used?
246
+ - What types of constraints appear repeatedly?
247
+ - How are objectives typically formulated?
248
+
249
+ 3. **Key Techniques & Tricks**:
250
+ - Any specific Gurobi features? (e.g., `addConstrs`, `quicksum`, binary variables, `setParam`)
251
+ - Modeling tricks? (e.g., big-M, indicator constraints, piecewise linear)
252
+ - Data structure patterns? (e.g., dictionaries for indices, list comprehensions)
253
+
254
+ 4. **Adaptation Guidance**:
255
+ - What aspects of the current problem are similar to the retrieved cases?
256
+ - What's different and requires new thinking?
257
+ - Which parts of the solution approaches can be directly adapted?
258
+
259
+ **Output Format**:
260
+ Provide a concise, actionable analysis (300-500 words) structured by the 4 points above. Be specific with code patterns and techniques, not just high-level descriptions.
261
+
262
+ **Important**: Extract **transferable knowledge**, not just summarize. Think about what the solver needs to know to adapt these solutions to the current problem."""
263
+
264
+ try:
265
+ analysis = get_response(analysis_prompt, model=model, temperature=temperature)
266
+ return analysis
267
+ except Exception as e:
268
+ print(f" ⚠️ Warning: Failed to refine cases with LLM: {e}")
269
+ # Fallback: return empty string, will use original formatting
270
+ return ""
271
+
272
+
273
+ def format_debug_cases_for_prompt(cases: List[Dict]) -> str:
274
+ if not cases:
275
+ return ""
276
+ lines = ["# Retrieved Debug Guidance", ""]
277
+ for idx, item in enumerate(cases, 1):
278
+ case = item["case"]
279
+ score = item.get("score") or 0.0  # guard against a missing similarity score
280
+ signature = case.get("metadata", {}).get("signature", "unknown")
281
+ status = case.get("metadata", {}).get("status", "")
282
+ lines.append(f"## Case {idx} (similarity {score:.3f})")
283
+ lines.append(f"Signature: {signature} | Status: {status}")
284
+ description = case.get("description", "").strip()
285
+ if description:
286
+ lines.append(description if len(description) < 800 else description[:800] + "\n...")
287
+ lines.append("---")
288
+ return "\n".join(lines).strip()
289
+
290
+
291
+ def build_error_feedback_prompt(
292
+ exec_result: ExecutionResult,
293
+ attempt_number: int,
294
+ previous_code: str,
295
+ debug_guidance: str = ""
296
+ ) -> str:
297
+ """
298
+ Build a prompt with error feedback for code correction
299
+
300
+ Args:
301
+ exec_result: Execution result with error information
302
+ attempt_number: Current attempt number
303
+ previous_code: The code that failed
+ debug_guidance: Optional historical debug guidance appended to the prompt
304
+
305
+ Returns:
306
+ Feedback prompt string
307
+ """
308
+ error_info = exec_result.stderr if exec_result.stderr else exec_result.stdout
309
+ if not error_info:
310
+ error_info = f"Status: {exec_result.status}"
311
+
312
+ feedback = f"""
313
+ # Code Execution Failed - Attempt {attempt_number}
314
+
315
+ Your previous code failed to execute successfully. Here is the error information:
316
+
317
+ ## Error Details:
318
+ ```
319
+ {error_info}
320
+ ```
321
+
322
+ ## Your Previous Code:
323
+ ```python
324
+ {previous_code}
325
+ ```
326
+
327
+ ## Instructions:
328
+ 1. Carefully analyze the error message above
329
+ 2. Identify the root cause of the error
330
+ 3. Fix the code to resolve the issue
331
+ 4. Common issues to check:
332
+ - Variable indexing (e.g., accessing index 0 when valid indices start from 1)
333
+ - Missing variable definitions
334
+ - Incorrect constraint formulations
335
+ - Type mismatches
336
+
337
+ Please provide the CORRECTED code in a ```python``` code block. Make sure to:
338
+ - Fix the specific error mentioned above
339
+ - Keep the overall structure and logic intact
340
+ - Ensure all variables are properly defined before use
341
+ """
342
+ if debug_guidance:
343
+ feedback += f"\n\n# Historical Debug Guidance\n{debug_guidance}\n"
344
+ return feedback
345
+
346
+
347
+ def generate_with_memory(
348
+ problem_id: int,
349
+ problem_desc: str,
350
+ memory_bank: MemoryBank,
351
+ model: str,
352
+ temperature: float,
353
+ top_k: int = 4,
354
+ filter_perfect: bool = True,
355
+ use_llm_refinement: bool = True,
356
+ *,
357
+ auto_debug: bool = True,
358
+ execution_timeout: int = 120,
359
+ debug_output_dir: Optional[str] = None,
360
+ debug_store: Optional[DebugMemoryStore] = None,
361
+ max_retries: int = 3,
362
+ debug_case_bank: Optional[MemoryBank] = None,
363
+ debug_case_top_k: int = 3
364
+ ) -> Dict:
365
+ """
366
+ Generate solution with memory enhancement
367
+
368
+ Args:
369
+ problem_id: Problem ID
370
+ problem_desc: Problem description
371
+ memory_bank: Memory bank instance
372
+ model: Model name
373
+ temperature: Generation temperature
374
+ top_k: Number of cases to retrieve (default: 4, will filter identical descriptions)
375
+ filter_perfect: Whether to filter out identical description matches
376
+ use_llm_refinement: Whether to use LLM to refine/summarize retrieved cases
377
+ auto_debug: Execute generated code and capture debug information
378
+ execution_timeout: Timeout (seconds) for executing generated code
379
+ debug_output_dir: Directory for storing debug artifacts (code, suggestions)
380
+ debug_store: Persistent store for debug experiences
+ max_retries: Maximum number of self-healing attempts per problem
+ debug_case_bank: Optional memory bank of consolidated debug cases
+ debug_case_top_k: Number of debug cases to retrieve when execution fails
381
+
382
+ Returns:
383
+ Dict with generation results
384
+ """
385
+ # Retrieve similar cases from memory
386
+ similar_cases = memory_bank.retrieve_similar_cases(problem_desc, top_k=top_k)
387
+ original_retrieved = len(similar_cases)
388
+
389
+ # Filter out identical descriptions (test set leakage)
390
+ if filter_perfect and similar_cases:
391
+ similar_cases = filter_perfect_matches(similar_cases, problem_desc)
392
+
393
+ # Prepare memory context
394
+ memory_context = ""
395
+ refined_insights = ""
396
+
397
+ if similar_cases:
398
+ if use_llm_refinement:
399
+ # Use LLM to analyze and refine the retrieved cases
400
+ print(f" 🧠 Using LLM to refine {len(similar_cases)} retrieved cases...")
401
+ refined_insights = refine_retrieved_cases_with_llm(
402
+ similar_cases, problem_desc, model, temperature=0.3
403
+ )
404
+
405
+ if refined_insights:
406
+ memory_context = f"""# Insights from Similar Problems in Memory
407
+
408
+ Based on analysis of {len(similar_cases)} similar problems, here are key insights:
409
+
410
+ {refined_insights}
411
+
412
+ ---
413
+
414
+ Please use these insights to guide your modeling approach for the current problem.
415
+ """
416
+ else:
417
+ # Fallback to original formatting if refinement fails
418
+ memory_context = memory_bank.format_retrieved_cases_for_prompt(similar_cases)
419
+ else:
420
+ # Use original formatting (full cases)
421
+ memory_context = memory_bank.format_retrieved_cases_for_prompt(similar_cases)
422
+
423
+ # Build prompt with memory context
424
+ prompt_template = get_prompt_template("default")
425
+ system_prompt = prompt_template["system"]
426
+ user_prompt = prompt_template["user"].format(question=problem_desc)
427
+
428
+ # Inject memory context if available
429
+ if memory_context:
430
+ user_prompt = f"{memory_context}\n\n{user_prompt}"
431
+
432
+ # Generate solution with self-healing retry mechanism
433
+ full_prompt = f"{system_prompt}\n\n{user_prompt}"
434
+
435
+ # Calculate prompt length for monitoring
436
+ prompt_length = len(full_prompt)
437
+ prompt_tokens_estimate = prompt_length // 4 # Rough estimate: 1 token ≈ 4 chars
438
+
439
+ # Variables to track across attempts
440
+ attempt_history = []
441
+ final_response = ''
442
+ final_code = ''
443
+ execution_status = 'not_executed'
444
+ execution_stdout = ''
445
+ execution_stderr = ''
446
+ execution_objective = None
447
+ execution_returncode = None
448
+ suggestions_path = ''
449
+ executed_code_path = ''
450
+ debug_signature = ''
451
+
452
+ try:
453
+ # Self-healing loop: try up to max_retries times
454
+ current_prompt = full_prompt
455
+
456
+ for attempt in range(1, max_retries + 1):
457
+ print(f" 🔄 Attempt {attempt}/{max_retries} for problem {problem_id}")
458
+
459
+ # Generate code
460
+ response = get_response(current_prompt, model=model, temperature=temperature)
461
+ code = extract_python_code(response)
462
+
463
+ # Record this attempt
464
+ attempt_info = {
465
+ 'attempt_number': attempt,
466
+ 'response': response,
467
+ 'code': code,
468
+ 'execution_status': 'not_executed',
469
+ }
470
+
471
+ if auto_debug and code.strip():
472
+ target_dir = debug_output_dir or os.path.join(os.getcwd(), "auto_debug")
473
+ os.makedirs(target_dir, exist_ok=True)
474
+
475
+ # Execute the generated code
476
+ exec_result = execute_generated_code(
477
+ code,
478
+ problem_id,
479
+ target_dir,
480
+ timeout=execution_timeout,
481
+ )
482
+
483
+ # Update attempt info
484
+ attempt_info['execution_status'] = exec_result.status
485
+ attempt_info['objective_value'] = exec_result.objective_value
486
+ attempt_info['stdout'] = exec_result.stdout[:200] if exec_result.stdout else ''
487
+ attempt_info['stderr'] = exec_result.stderr[:200] if exec_result.stderr else ''
488
+
489
+ # Check if execution was successful
490
+ if exec_result.status == 'success':
491
+ # Success! Use this result
492
+ print(f" ✅ Success on attempt {attempt}")
493
+ execution_status = exec_result.status
494
+ execution_stdout = exec_result.stdout
495
+ execution_stderr = exec_result.stderr
496
+ execution_objective = exec_result.objective_value
497
+ execution_returncode = exec_result.returncode
498
+ executed_code_path = exec_result.code_path or ''
499
+ final_response = response
500
+ final_code = code
501
+ attempt_history.append(attempt_info)
502
+ break # Exit the retry loop
503
+ else:
504
+ # Failure - prepare for retry
505
+ print(f" ❌ Failed on attempt {attempt}: {exec_result.status}")
506
+ execution_status = exec_result.status
507
+ execution_stdout = exec_result.stdout
508
+ execution_stderr = exec_result.stderr
509
+ execution_returncode = exec_result.returncode
510
+ executed_code_path = exec_result.code_path or ''
511
+ final_response = response
512
+ final_code = code
513
+
514
+ # Write debug report
515
+ suggestions_path = write_debug_report(
516
+ problem_id,
517
+ problem_desc,
518
+ exec_result,
519
+ target_dir,
520
+ )
521
+
522
+ # Record to debug store
523
+ error_message = execution_stderr or execution_stdout or execution_status
524
+ if debug_store:
525
+ debug_signature = debug_store.record_execution_feedback(
526
+ problem_id=problem_id,
527
+ description=problem_desc,
528
+ status=execution_status,
529
+ error_text=error_message,
530
+ guidance=f"Attempt {attempt}/{max_retries} failed. Review the debug report.",
531
+ source="generate_with_memory.auto_debug.self_healing",
532
+ metadata={
533
+ "attempt": attempt,
534
+ "returncode": execution_returncode,
535
+ "code_path": executed_code_path,
536
+ },
537
+ )
538
+
539
+ attempt_history.append(attempt_info)
540
+
541
+ # If not the last attempt, prepare retry prompt
542
+ if attempt < max_retries:
543
+ guidance_text = ""
544
+ if debug_case_bank and error_message:
545
+ debug_cases = debug_case_bank.retrieve_similar_cases(
546
+ error_message,
547
+ top_k=debug_case_top_k,
548
+ )
549
+ guidance_text = format_debug_cases_for_prompt(debug_cases)
550
+ error_feedback = build_error_feedback_prompt(
551
+ exec_result,
552
+ attempt,
553
+ code,
554
+ debug_guidance=guidance_text,
555
+ )
556
+ # Append error feedback to the prompt for next attempt
557
+ current_prompt = f"{full_prompt}\n\n{error_feedback}"
558
+ print(f" 🔧 Preparing retry with error feedback...")
559
+ else:
560
+ print(f" ⚠️ Max retries ({max_retries}) reached, giving up")
561
+
562
+ elif not code.strip():
563
+ # No code generated
564
+ attempt_info['execution_status'] = 'no_code'
565
+ attempt_history.append(attempt_info)
566
+ execution_status = 'no_code'
567
+ execution_stderr = 'Generated code block is empty.'
568
+ final_response = response
569
+ final_code = code
570
+
571
+ if attempt < max_retries:
572
+ # Retry with feedback about missing code
573
+ feedback = "\n\nYour previous response did not contain any Python code. Please provide the complete Gurobi code in a ```python``` code block."
574
+ current_prompt = f"{full_prompt}\n\n{feedback}"
575
+ print(f" ⚠️ No code generated, retrying...")
576
+ else:
577
+ print(f" ⚠️ Max retries reached, no code generated")
578
+ break
579
+
580
+ elif not auto_debug:
581
+ # Auto debug disabled, just use the generated code
582
+ execution_status = 'skipped'
583
+ final_response = response
584
+ final_code = code
585
+ attempt_history.append(attempt_info)
586
+ break
587
+
588
+ if auto_debug:
589
+ if execution_status == 'success':
590
+ final_status = 'success'
591
+ elif final_code.strip():
592
+ final_status = 'execution_failed'
593
+ else:
594
+ final_status = 'no_code'
595
+ else:
596
+ final_status = 'success' if final_code.strip() else 'no_code'
597
+
598
+ return {
599
+ 'id': problem_id,
600
+ 'model': model,
601
+ 'temperature': temperature,
602
+ 'description': problem_desc,
603
+ 'full_input_prompt': full_prompt, # 💾 Complete input for reproducibility
604
+ 'refined_insights': refined_insights if use_llm_refinement else '', # LLM-refined insights
605
+ 'prompt_length_chars': prompt_length,
606
+ 'prompt_length_tokens_est': prompt_tokens_estimate,
607
+ 'raw_response': final_response,
608
+ 'generated_code': final_code,
609
+ 'retrieved_cases': len(similar_cases),
610
+ 'original_retrieved': original_retrieved,
611
+ 'use_llm_refinement': use_llm_refinement,
612
+ 'status': final_status,
613
+ 'execution_status': execution_status,
614
+ 'execution_stdout': execution_stdout,
615
+ 'execution_stderr': execution_stderr,
616
+ 'execution_objective_value': execution_objective,
617
+ 'execution_returncode': execution_returncode,
618
+ 'debug_suggestions_path': suggestions_path,
619
+ 'executed_code_path': executed_code_path if executed_code_path else '',
620
+ 'debug_signature': debug_signature,
621
+ 'auto_debug_enabled': auto_debug,
622
+ 'execution_timeout_sec': execution_timeout if auto_debug else None,
623
+ 'max_retries': max_retries,
624
+ 'total_attempts': len(attempt_history),
625
+ 'attempt_history': attempt_history,
626
+ 'self_healing_enabled': True,
627
+ }
628
+
629
+ except Exception as e:
630
+ print(f"Error generating solution for problem {problem_id}: {e}")
631
+
632
+ # Still save the prompt even on error
633
+ full_prompt = f"{system_prompt}\n\n{user_prompt}"
634
+
635
+ return {
636
+ 'id': problem_id,
637
+ 'model': model,
638
+ 'temperature': temperature,
639
+ 'description': problem_desc,
640
+ 'full_input_prompt': full_prompt, # Save even on error
641
+ 'refined_insights': '',
642
+ 'prompt_length_chars': len(full_prompt),
643
+ 'prompt_length_tokens_est': len(full_prompt) // 4,
644
+ 'raw_response': '',
645
+ 'generated_code': '',
646
+ 'retrieved_cases': len(similar_cases) if similar_cases else 0,
647
+ 'original_retrieved': original_retrieved,
648
+ 'use_llm_refinement': use_llm_refinement,
649
+ 'status': 'error',
650
+ 'error': str(e),
651
+ 'execution_status': 'not_executed',
652
+ 'execution_stdout': '',
653
+ 'execution_stderr': '',
654
+ 'execution_objective_value': None,
655
+ 'execution_returncode': None,
656
+ 'debug_suggestions_path': '',
657
+ 'executed_code_path': '',
658
+ 'debug_signature': '',
659
+ 'auto_debug_enabled': auto_debug,
660
+ 'execution_timeout_sec': execution_timeout if auto_debug else None,
661
+ 'max_retries': max_retries,
662
+ 'total_attempts': 0,
663
+ 'attempt_history': [],
664
+ 'self_healing_enabled': True,
665
+ }
666
+
667
+
668
+ def generate_single_problem(
669
+ problem: Dict,
670
+ memory_bank: MemoryBank,
671
+ model: str,
672
+ temperature: float,
673
+ top_k: int,
674
+ filter_perfect: bool,
675
+ use_llm_refinement: bool,
676
+ *,
677
+ auto_debug: bool,
678
+ execution_timeout: int,
679
+ debug_output_dir: Optional[str],
680
+ debug_store: Optional[DebugMemoryStore],
681
+ max_retries: int = 3,
682
+ debug_case_bank: Optional[MemoryBank] = None,
683
+ debug_case_top_k: int = 3,
684
+ ) -> Dict:
685
+ """
686
+ Wrapper for parallel execution
687
+ """
688
+ problem_id = problem['id']
689
+ problem_desc = problem['description']
690
+
691
+ result = generate_with_memory(
692
+ problem_id, problem_desc, memory_bank,
693
+ model, temperature, top_k, filter_perfect, use_llm_refinement,
694
+ auto_debug=auto_debug,
695
+ execution_timeout=execution_timeout,
696
+ debug_output_dir=debug_output_dir,
697
+ debug_store=debug_store,
698
+ max_retries=max_retries,
699
+ debug_case_bank=debug_case_bank,
700
+ debug_case_top_k=debug_case_top_k,
701
+ )
702
+
703
+ # Add ground truth
704
+ result['answer'] = problem.get('answer', '')
705
+
706
+ return result
707
+
708
+
709
+ def main():
710
+ parser = argparse.ArgumentParser(description='Generate with Memory (parallel single solutions)')
711
+ parser.add_argument('--dataset', type=str, default='IndustryOR',
712
+ help='Dataset name')
713
+ parser.add_argument('--model', type=str, default='gpt-4o',
714
+ help='Model name')
715
+ parser.add_argument('--temperature', type=float, default=0.01,
716
+ help='Temperature for generation')
717
+ parser.add_argument('--max_problems', type=int, default=None,
718
+ help='Maximum number of problems to process')
719
+ parser.add_argument('--output', type=str, required=True,
720
+ help='Output file path (JSONL)')
721
+ parser.add_argument('--memory_dir', type=str, default=str(DEFAULT_MEMORY_DIR),
722
+ help='Memory storage directory')
723
+ parser.add_argument('--embedding_model', type=str, default=None,
724
+ help='Optional embedding model name or local path for memory retrieval')
725
+ parser.add_argument('--memory_top_k', type=int, default=4,
726
+ help='Number of cases to retrieve from memory (default: 4)')
727
+ parser.add_argument('--no_filter_perfect', action='store_true',
728
+ help='Disable filtering of perfect similarity matches')
729
+ parser.add_argument('--use_llm_refinement', action='store_true',
730
+ help='Use LLM to refine/summarize retrieved cases (improves quality, costs more API calls)')
731
+ parser.add_argument('--parallel', type=int, default=5,
732
+ help='Number of parallel workers')
733
+ parser.add_argument('--execution_timeout', type=int, default=120,
734
+ help='Timeout (seconds) for executing generated code during auto-debug')
735
+ parser.add_argument('--no_auto_debug', action='store_true',
736
+ help='Disable automatic execution and debug capture for generated code')
737
+ parser.add_argument('--debug_output_dir', type=str, default=None,
738
+ help='Directory to store auto-debug artifacts (code, logs, suggestions)')
739
+ parser.add_argument('--debug_memory_path', type=str, default=str(DEFAULT_DEBUG_MEMORY),
740
+ help='Path to persistent debug memory JSONL file')
741
+ parser.add_argument('--debug_case_memory_dir', type=str, default=str(DEFAULT_DEBUG_CASE_MEMORY),
742
+ help='Directory containing consolidated debug-case memory (built via build_debug_memory.py)')
743
+ parser.add_argument('--debug_case_memory_top_k', type=int, default=3,
744
+ help='How many debug memory cases to retrieve when execution fails')
745
+ parser.add_argument('--max_retries', type=int, default=3,
746
+ help='Maximum number of retry attempts for self-healing (default: 3)')
747
+
748
+ args = parser.parse_args()
749
+
750
+ args.dataset = normalize_dataset_name(args.dataset)
751
+
752
+ auto_debug_enabled = not args.no_auto_debug
753
+ debug_output_dir = args.debug_output_dir
754
+ debug_store: Optional[DebugMemoryStore] = None
755
+ if auto_debug_enabled:
756
+ if debug_output_dir is None:
757
+ base_dir = os.path.dirname(args.output) or '.'
758
+ debug_output_dir = os.path.join(base_dir, 'auto_debug')
759
+ os.makedirs(debug_output_dir, exist_ok=True)
760
+ debug_store = DebugMemoryStore(args.debug_memory_path)
761
+ else:
762
+ debug_output_dir = None
763
+
764
+ debug_case_bank: Optional[MemoryBank] = None
765
+ if auto_debug_enabled and args.debug_case_memory_top_k > 0 and args.debug_case_memory_dir:
766
+ case_dir = Path(args.debug_case_memory_dir)
767
+ if case_dir.exists():
768
+ try:
769
+ if args.embedding_model:
770
+ debug_case_bank = MemoryBank(
771
+ memory_dir=str(case_dir),
772
+ embedding_model=args.embedding_model,
773
+ )
774
+ else:
775
+ debug_case_bank = MemoryBank(memory_dir=str(case_dir))
776
+ except Exception as exc: # noqa: BLE001
777
+ print(f"⚠️ Warning: failed to load debug-case memory from {case_dir} ({exc})")
778
+ else:
779
+ print(f"ℹ️ Debug-case memory directory not found: {case_dir} (skipping retrieval)")
780
+
781
+ print("="*80)
782
+ print("🧠 Generate with Memory (Parallel)")
783
+ print("="*80)
784
+ print(f"Dataset: {args.dataset}")
785
+ print(f"Model: {args.model}")
786
+ print(f"Temperature: {args.temperature}")
787
+ print(f"Memory dir: {args.memory_dir}")
788
+ if args.embedding_model:
789
+ print(f"Embedding: {args.embedding_model}")
790
+ print(f"Memory Top-K: {args.memory_top_k}")
791
+ print(f"Filter perfect matches: {not args.no_filter_perfect}")
792
+ print(f"LLM Refinement: {'✅ Enabled' if args.use_llm_refinement else '❌ Disabled'}")
793
+ print(f"Parallel: {args.parallel}")
794
+ print(f"Output: {args.output}")
795
+ print(f"Auto Debug: {'✅ Enabled' if auto_debug_enabled else '❌ Disabled'}")
796
+ if auto_debug_enabled:
797
+ print(f" Debug dir: {debug_output_dir}")
798
+ if args.debug_memory_path:
799
+ print(f" Debug memory: {args.debug_memory_path}")
800
+ print(f" Exec timeout: {args.execution_timeout}s")
801
+ print(f" Max retries: {args.max_retries} (Self-healing enabled)")
802
+ print("="*80)
803
+ print()
804
+
805
+ # Initialize memory bank only when retrieval is active.
806
+ if args.memory_top_k > 0:
807
+ print("Initializing memory bank...")
808
+ if args.embedding_model:
809
+ memory_bank = MemoryBank(memory_dir=args.memory_dir, embedding_model=args.embedding_model)
810
+ else:
811
+ memory_bank = MemoryBank(memory_dir=args.memory_dir)
812
+ print()
813
+ else:
814
+ print("Skipping memory bank initialization because memory_top_k=0")
815
+ print()
816
+ memory_bank = NoOpMemoryBank()
817
+
818
+ # Load dataset
819
+ problems = load_dataset(args.dataset)
820
+ if args.max_problems:
821
+ problems = problems[:args.max_problems]
822
+
823
+ print(f"Processing {len(problems)} problems with {args.parallel} workers")
824
+ print()
825
+
826
+ # Create output directory
827
+ os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True)
828
+
829
+ # Parallel generation
830
+ results = []
831
+
832
+ if args.parallel <= 1:
833
+ # Sequential processing
834
+ for problem in tqdm(problems, desc="Generating"):
835
+ result = generate_single_problem(
836
+ problem, memory_bank, args.model, args.temperature,
837
+ args.memory_top_k, not args.no_filter_perfect, args.use_llm_refinement,
838
+ auto_debug=auto_debug_enabled,
839
+ execution_timeout=args.execution_timeout,
840
+ debug_output_dir=debug_output_dir,
841
+ debug_store=debug_store,
842
+ max_retries=args.max_retries,
843
+ debug_case_bank=debug_case_bank,
844
+ debug_case_top_k=args.debug_case_memory_top_k,
845
+ )
846
+ results.append(result)
847
+ else:
848
+ # Parallel processing
849
+ with ThreadPoolExecutor(max_workers=args.parallel) as executor:
850
+ futures = {
851
+ executor.submit(
852
+ generate_single_problem,
853
+ problem, memory_bank, args.model, args.temperature,
854
+ args.memory_top_k, not args.no_filter_perfect, args.use_llm_refinement,
855
+ auto_debug=auto_debug_enabled,
856
+ execution_timeout=args.execution_timeout,
857
+ debug_output_dir=debug_output_dir,
858
+ debug_store=debug_store,
859
+ max_retries=args.max_retries,
860
+ debug_case_bank=debug_case_bank,
861
+ debug_case_top_k=args.debug_case_memory_top_k,
862
+ ): problem for problem in problems
863
+ }
864
+
865
+ for future in tqdm(as_completed(futures), total=len(problems), desc="Generating"):
866
+ try:
867
+ result = future.result()
868
+ results.append(result)
869
+ except Exception as e:
870
+ problem = futures[future]
871
+ print(f"Error processing problem {problem['id']}: {e}")
872
+
873
+ # Sort by problem ID
874
+ results.sort(key=lambda x: x['id'])
875
+
876
+ # Save results
877
+ with open(args.output, 'w', encoding='utf-8') as f:
878
+ for result in results:
879
+ f.write(json.dumps(result, ensure_ascii=False) + '\n')
880
+
881
+ print()
882
+ print("="*80)
883
+ print("✅ Generation Complete")
884
+ print("="*80)
885
+ print(f"Total problems: {len(results)}")
886
+ status_counts = Counter(r.get('status', 'unknown') for r in results)
887
+ print(f"Successful: {status_counts.get('success', 0)}")
888
+ print(f"Errors: {status_counts.get('error', 0)}")
889
+ print(f"Results saved to: {args.output}")
890
+ if status_counts:
891
+ print("Status breakdown:")
892
+ for status, count in sorted(status_counts.items()):
893
+ print(f" {status:<18}: {count}")
894
+
895
+ # Memory statistics
896
+ total_retrieved = sum(r.get('retrieved_cases', 0) for r in results)
897
+ total_original = sum(r.get('original_retrieved', 0) for r in results)
898
+ filtered = total_original - total_retrieved
899
+
900
+ # Prompt length statistics
901
+ prompt_lengths = [r.get('prompt_length_tokens_est', 0) for r in results if r.get('status') == 'success']
902
+ avg_prompt_tokens = sum(prompt_lengths) / len(prompt_lengths) if prompt_lengths else 0
903
+ max_prompt_tokens = max(prompt_lengths) if prompt_lengths else 0
904
+
905
+ print()
906
+ print("Memory Statistics:")
907
+ print(f" Total retrievals: {total_original}")
908
+ print(f" After filtering: {total_retrieved}")
909
+ print(f" Filtered out: {filtered} (perfect matches)")
910
+ print(f" Avg per problem: {total_retrieved / len(results):.2f}")
911
+ print()
912
+ print("Prompt Length Statistics:")
913
+ print(f" Avg prompt tokens: {avg_prompt_tokens:.0f}")
914
+ print(f" Max prompt tokens: {max_prompt_tokens:.0f}")
915
+ print(f" ℹ️ All prompts saved in 'full_input_prompt' field")
916
+ print("="*80)
917
+
918
+
919
+ if __name__ == "__main__":
920
+ main()
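
A minimal sketch of the code-extraction helper defined in this file, illustrative only; the import path is assumed from the `src/` layout shown in this commit, and the sample strings are invented:

```python
# Illustrative sketch, not part of the uploaded file: exercises extract_python_code().
# The import path below is an assumption based on the src/ layout in this commit.
from src.debate_memory.generate_with_memory import extract_python_code

tagged = "<python>\nimport gurobipy as gp\nmodel = gp.Model()\n</python>"
fenced = "`" * 3 + "python\nimport gurobipy as gp\nmodel = gp.Model()\n" + "`" * 3

# Both the <python> tag form and the markdown fence form yield the same code.
assert extract_python_code(tagged) == extract_python_code(fenced)
print(repr(extract_python_code("no code here")))  # '' when no code block is found
```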
src/debate_memory/llm.py ADDED
@@ -0,0 +1,111 @@
1
+ """
2
+ Lightweight HTTP client for OpenAI-compatible chat completions.
3
+
4
+ - Credentials are read from environment variables only.
5
+ - Supported environment variables:
6
+ * `LLM_API_BASE_URL`
7
+ * `LLM_API_KEY`
8
+ * `OPENAI_BASE_URL`
9
+ * `OPENAI_API_KEY`
10
+ * `API_URL`
11
+ * `API_KEY`
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ import time
19
+ from typing import Dict, List
20
+
21
+ import requests
22
+
23
+ def _get_credentials() -> Dict[str, str]:
24
+ api_key = (
25
+ os.getenv("LLM_API_KEY")
26
+ or os.getenv("OPENAI_API_KEY")
27
+ or os.getenv("API_KEY")
28
+ )
29
+ base_url = (
30
+ os.getenv("LLM_API_BASE_URL")
31
+ or os.getenv("OPENAI_BASE_URL")
32
+ or os.getenv("API_URL")
33
+ )
34
+ if not api_key:
35
+ raise RuntimeError(
36
+ "Missing API key. Set one of: LLM_API_KEY, OPENAI_API_KEY, API_KEY."
37
+ )
38
+ if not base_url:
39
+ raise RuntimeError(
40
+ "Missing API base URL. Set one of: "
41
+ "LLM_API_BASE_URL, OPENAI_BASE_URL, API_URL."
42
+ )
43
+ return {"api_key": api_key, "base_url": base_url.rstrip("/")}
44
+
45
+
46
+ def _post_chat_completion(
47
+ messages: List[Dict[str, str]],
48
+ model: str,
49
+ temperature: float,
50
+ max_tokens: int,
51
+ ) -> Dict:
52
+ creds = _get_credentials()
53
+ url = f"{creds['base_url']}/chat/completions"
54
+ headers = {
55
+ "Authorization": f"Bearer {creds['api_key']}",
56
+ "Content-Type": "application/json",
57
+ }
58
+ payload = {
59
+ "model": model,
60
+ "messages": messages,
61
+ "temperature": temperature,
62
+ "max_tokens": max_tokens,
63
+ }
64
+ response = requests.post(url, headers=headers, json=payload, timeout=120)
65
+ response.raise_for_status()
66
+ try:
67
+ return response.json()
68
+ except json.JSONDecodeError as exc: # pragma: no cover - defensive
69
+ raise RuntimeError(f"Non-JSON response from LLM API: {response.text[:200]}") from exc
70
+
71
+
72
+ def _extract_content(result: Dict) -> str:
73
+ choices = result.get("choices")
74
+ if not choices:
75
+ raise RuntimeError(f"LLM API response missing 'choices': {result}")
76
+ message = choices[0].get("message") or {}
77
+ content = message.get("content")
78
+ if content is None:
79
+ raise RuntimeError(f"LLM API response missing message content: {result}")
80
+ return content
81
+
82
+
83
+ def get_response(prompt: str, model: str, temperature: float = 0.01, maximum_retries: int = 10) -> str:
84
+ """
85
+ Send a chat completion request using OpenAI-compatible REST calls.
86
+ """
87
+ if model.startswith("deepseek"):
88
+ real_model = model.replace("-chat", "-v3").replace("-reasoner", "-r1")
89
+ else:
90
+ real_model = model
91
+
92
+ attempts = max(1, maximum_retries)
93
+ last_error: Exception | None = None
94
+ while attempts > 0:
95
+ try:
96
+ result = _post_chat_completion(
97
+ messages=[{"role": "user", "content": prompt}],
98
+ model=real_model,
99
+ temperature=temperature,
100
+ max_tokens=16384,
101
+ )
102
+ return _extract_content(result)
103
+ except Exception as exc: # noqa: BLE001
104
+ last_error = exc
105
+ attempts -= 1
106
+ if attempts == 0:
107
+ break
108
+ print(f"Error using API: {exc}. Retrying...")
109
+ time.sleep(2)
110
+
111
+ raise RuntimeError(f"Failed to get response from API after retries: {last_error}")
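
A minimal sketch of calling the client above once the environment variables from the module docstring are set; the base URL and key below are placeholders, not real credentials or endpoints:

```python
# Illustrative sketch, not part of the uploaded file.
# Any OpenAI-compatible endpoint should work; these values are placeholders.
import os

os.environ.setdefault("LLM_API_BASE_URL", "https://example.invalid/v1")
os.environ.setdefault("LLM_API_KEY", "sk-placeholder")

from src.debate_memory.llm import get_response

answer = get_response(
    "Formulate a small LP with two decision variables and solve it with Gurobi.",
    model="gpt-4o",
    temperature=0.01,
)
print(answer)
```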
src/debate_memory/memory_bank.py ADDED
@@ -0,0 +1,316 @@
1
+ """
2
+ Memory Bank for storing and retrieving successful problem-solving cases
3
+ Uses LlamaIndex for RAG-based case retrieval
4
+ """
5
+
6
+ import os
7
+ import json
8
+ from pathlib import Path
9
+ from typing import List, Dict, Optional
10
+ from llama_index.core import Document, VectorStoreIndex, StorageContext, load_index_from_storage
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from llama_index.core import Settings
13
+
14
+ _PKG_DIR = Path(__file__).resolve().parent
15
+ _PROJECT_ROOT = _PKG_DIR.parent.parent
16
+ DEFAULT_MEMORY_DIR = str(_PROJECT_ROOT / "memory_storage")
17
+
18
+
19
+ class MemoryBank:
20
+ """
21
+ Memory Bank for storing successful problem-solving experiences
22
+
23
+ Design inspired by Memento (https://arxiv.org/pdf/2508.16153):
24
+ - Episodic memory: Store past successful trajectories
25
+ - Case-based reasoning: Retrieve similar cases to guide current problem
26
+ - Non-parametric: No gradient updates, just memory read/write
27
+ """
28
+
29
+ def __init__(self, memory_dir: str = DEFAULT_MEMORY_DIR, embedding_model: str = "BAAI/bge-small-en-v1.5"):
30
+ """
31
+ Initialize Memory Bank
32
+
33
+ Args:
34
+ memory_dir: Directory to store memory index and cases
35
+ embedding_model: HuggingFace embedding model name or local path
36
+ """
37
+ self.memory_dir = memory_dir
38
+ os.makedirs(memory_dir, exist_ok=True)
39
+
40
+ self.cases_file = os.path.join(memory_dir, "cases.jsonl")
41
+ self.index_dir = os.path.join(memory_dir, "index")
42
+
43
+ # Configure embedding model with local caching
44
+ # Set cache_folder to use llama_index's cache directory
45
+ # Set trust_remote_code to False for security
46
+ # If embedding_model is a local path, use it directly
47
+ # Otherwise, try to use cached model to avoid network requests
48
+ os.environ.setdefault("HF_HUB_OFFLINE", "0") # Allow online access by default
49
+
50
+ # Check if embedding_model is a local file path
51
+ is_local_path = os.path.isabs(embedding_model) or (os.path.sep in embedding_model and os.path.exists(embedding_model))
52
+
53
+ try:
54
+ # If it's a local path, use it directly
55
+ if is_local_path:
56
+ print(f"📁 Using local embedding model from: {embedding_model}")
57
+ Settings.embed_model = HuggingFaceEmbedding(
58
+ model_name=embedding_model,
59
+ cache_folder=os.path.expanduser("~/.cache/llama_index"),
60
+ trust_remote_code=False
61
+ )
62
+ else:
63
+ # Try to load from cache first to avoid network requests
64
+ # Set HF_HUB_OFFLINE=1 to force local-only mode
65
+ print(f"🔍 Loading embedding model: {embedding_model}")
66
+ print(" (If you want to avoid Hugging Face downloads, set HF_HUB_OFFLINE=1 or use a local model path)")
67
+ Settings.embed_model = HuggingFaceEmbedding(
68
+ model_name=embedding_model,
69
+ cache_folder=os.path.expanduser("~/.cache/llama_index"),
70
+ trust_remote_code=False
71
+ )
72
+ except Exception as e:
73
+ # If model loading fails, try to use cached model only
74
+ print(f"⚠️ Warning: Failed to load embedding model '{embedding_model}': {e}")
75
+ print(" Attempting to use cached model only (setting HF_HUB_OFFLINE=1)...")
76
+ os.environ["HF_HUB_OFFLINE"] = "1"
77
+ try:
78
+ Settings.embed_model = HuggingFaceEmbedding(
79
+ model_name=embedding_model,
80
+ cache_folder=os.path.expanduser("~/.cache/llama_index"),
81
+ trust_remote_code=False
82
+ )
83
+ print(" ✅ Using cached model")
84
+ except Exception as e2:
85
+ print(f"❌ Error: Could not load embedding model: {e2}")
86
+ print(" Please either:")
87
+ print(" 1. Download the model first: python -c \"from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-small-en-v1.5')\"")
88
+ print(" 2. Set HF_HUB_OFFLINE=1 and ensure the model is cached")
89
+ print(" 3. Use a local model path: --embedding_model /path/to/local/model")
90
+ raise
91
+ # Disable chunking to ensure one document = one node (no duplicates)
92
+ Settings.chunk_size = 8192 # Large enough to never split
93
+ Settings.chunk_overlap = 0
94
+
95
+ # Load or create index
96
+ self.index = self._load_or_create_index()
97
+ self.case_count = self._count_cases()
98
+
99
+ print(f"Memory Bank initialized with {self.case_count} cases")
100
+
101
+ def _load_or_create_index(self):
102
+ """Load existing index or create new one"""
103
+ if os.path.exists(self.index_dir):
104
+ try:
105
+ storage_context = StorageContext.from_defaults(persist_dir=self.index_dir)
106
+ index = load_index_from_storage(storage_context)
107
+ print(f"Loaded existing memory index from {self.index_dir}")
108
+ return index
109
+ except Exception:  # avoid a bare except that would also swallow KeyboardInterrupt
110
+ print("Failed to load index, creating new one")
111
+
112
+ # Create new empty index
113
+ documents = []
114
+ index = VectorStoreIndex.from_documents(documents)
115
+ os.makedirs(self.index_dir, exist_ok=True)
116
+ index.storage_context.persist(persist_dir=self.index_dir)
117
+ print(f"Created new memory index at {self.index_dir}")
118
+ return index
119
+
120
+ def _count_cases(self) -> int:
121
+ """Count number of cases in memory"""
122
+ if not os.path.exists(self.cases_file):
123
+ return 0
124
+ with open(self.cases_file, 'r', encoding='utf-8') as f:
125
+ return sum(1 for _ in f)
126
+
127
+ def add_case(self, problem_id: int, problem_desc: str, solution_code: str,
128
+ objective_value: float, is_correct: bool, metadata: Optional[Dict] = None):
129
+ """
130
+ Add a successful case to memory
131
+
132
+ Args:
133
+ problem_id: Problem ID
134
+ problem_desc: Problem description
135
+ solution_code: Solution code
136
+ objective_value: Computed objective value
137
+ is_correct: Whether the solution is correct
138
+ metadata: Additional metadata (model, debate_rounds, etc.)
139
+ """
140
+ if not is_correct:
141
+ # Only store successful cases
142
+ return
143
+
144
+ case = {
145
+ 'problem_id': problem_id,
146
+ 'description': problem_desc,
147
+ 'solution_code': solution_code,
148
+ 'objective_value': objective_value,
149
+ 'is_correct': is_correct,
150
+ 'metadata': metadata or {}
151
+ }
152
+
153
+ # Write to cases file
154
+ with open(self.cases_file, 'a', encoding='utf-8') as f:
155
+ f.write(json.dumps(case, ensure_ascii=False) + '\n')
156
+
157
+ # Create document for indexing
158
+ # Combine description and key solution insights for better retrieval
159
+ doc_text = f"""Problem: {problem_desc}
160
+
161
+ Solution approach:
162
+ {solution_code[:500]}...
163
+
164
+ Key features:
165
+ - Problem ID: {problem_id}
166
+ - Objective value: {objective_value}
167
+ - Status: Correct
168
+ """
169
+
170
+ doc = Document(
171
+ text=doc_text,
172
+ metadata={
173
+ 'problem_id': problem_id,
174
+ 'objective_value': objective_value,
175
+ **case['metadata']
176
+ }
177
+ )
178
+
179
+ # Add to index
180
+ self.index.insert(doc)
181
+ self.index.storage_context.persist(persist_dir=self.index_dir)
182
+
183
+ self.case_count += 1
184
+ print(f"✅ Added case {problem_id} to memory (Total: {self.case_count})")
185
+
186
+ def retrieve_similar_cases(self, query: str, top_k: int = 3, preferred_dataset: Optional[str] = None) -> List[Dict]:
187
+ """
188
+ Retrieve similar cases from memory using RAG based on semantic similarity
189
+
190
+ Args:
191
+ query: Query text (usually the problem description)
192
+ top_k: Number of similar cases to retrieve (0 = no retrieval)
193
+ preferred_dataset: Preferred dataset name to prioritize (optional)
194
+
195
+ Returns:
196
+ List of similar cases with scores, sorted by semantic similarity
197
+ """
198
+ if self.case_count == 0 or top_k <= 0:
199
+ return []
200
+
201
+ # Query the index - purely based on semantic similarity
202
+ retriever = self.index.as_retriever(similarity_top_k=top_k * 2 if preferred_dataset else top_k)
203
+ nodes = retriever.retrieve(query)
204
+
205
+ # Load corresponding cases from cases.jsonl based on semantic similarity
206
+ similar_cases = []
207
+ seen_keys = set() # Track which (problem_id, dataset) combinations we've added
208
+
209
+ # If preferred_dataset is specified, prioritize those cases
210
+ preferred_cases = []
211
+ other_cases = []
212
+
213
+ for node in nodes:
214
+ problem_id = node.metadata.get('problem_id')
215
+ score = node.score
216
+ node_dataset = node.metadata.get('dataset', '')
217
+
218
+ # Build key for deduplication
219
+ case_key = (problem_id, node_dataset)
220
+ if case_key in seen_keys:
221
+ continue
222
+
223
+ # Load the case - use dataset from node metadata to get the exact match
224
+ case_data = None
225
+ if node_dataset:
226
+ # Try to load by problem_id and dataset (more precise)
227
+ case_data = self._load_case_by_id_and_dataset(problem_id, node_dataset)
228
+
229
+ if not case_data:
230
+ # Fallback: try to load by problem_id only
231
+ case_data = self._load_case_by_id(problem_id)
232
+
233
+ if case_data:
234
+ seen_keys.add(case_key)
235
+ case_item = {
236
+ 'case': case_data,
237
+ 'score': score,
238
+ 'text_preview': node.text[:200]
239
+ }
240
+
241
+ # Separate preferred dataset cases from others
242
+ if preferred_dataset and node_dataset == preferred_dataset:
243
+ preferred_cases.append(case_item)
244
+ else:
245
+ other_cases.append(case_item)
246
+
247
+ # Combine: preferred cases first, then others, all sorted by similarity score
248
+ similar_cases = preferred_cases + other_cases
249
+
250
+ # Return top_k results
251
+ return similar_cases[:top_k]
252
+
253
+ def _load_case_by_id(self, problem_id: int) -> Optional[Dict]:
254
+ """Load a specific case by problem ID (returns first match)"""
255
+ if not os.path.exists(self.cases_file):
256
+ return None
257
+
258
+ with open(self.cases_file, 'r', encoding='utf-8') as f:
259
+ for line in f:
260
+ case = json.loads(line)
261
+ if case['problem_id'] == problem_id:
262
+ return case
263
+ return None
264
+
265
+ def _load_case_by_id_and_dataset(self, problem_id: int, dataset: str) -> Optional[Dict]:
266
+ """Load a specific case by problem ID and dataset"""
267
+ if not os.path.exists(self.cases_file):
268
+ return None
269
+
270
+ with open(self.cases_file, 'r', encoding='utf-8') as f:
271
+ for line in f:
272
+ case = json.loads(line)
273
+ if case['problem_id'] == problem_id:
274
+ case_dataset = case.get('metadata', {}).get('dataset', '')
275
+ if case_dataset == dataset:
276
+ return case
277
+ return None
278
+
279
+ def get_memory_stats(self) -> Dict:
280
+ """Get memory bank statistics"""
281
+ return {
282
+ 'total_cases': self.case_count,
283
+ 'memory_dir': self.memory_dir,
284
+ 'cases_file': self.cases_file,
285
+ 'index_dir': self.index_dir
286
+ }
287
+
288
+ def format_retrieved_cases_for_prompt(self, cases: List[Dict]) -> str:
289
+ """
290
+ Format retrieved cases for inclusion in LLM prompt
291
+
292
+ Args:
293
+ cases: List of retrieved cases
294
+
295
+ Returns:
296
+ Formatted string for prompt
297
+ """
298
+ if not cases:
299
+ return ""
300
+
301
+ prompt = "# Retrieved Similar Cases from Memory\n\n"
302
+ prompt += "The following successful cases from previous problems might be relevant:\n\n"
303
+
304
+ for i, item in enumerate(cases, 1):
305
+ case = item['case']
306
+ score = item['score']
307
+
308
+ prompt += f"## Case {i} (Similarity: {score:.3f})\n"
309
+ prompt += f"**Problem:** {case['description']}\n\n"
310
+ prompt += f"**Solution approach:**\n```python\n{case['solution_code']}\n```\n\n"
311
+ prompt += f"**Result:** Objective value = {case['objective_value']}, Status = Correct\n\n"
312
+ prompt += "---\n\n"
313
+
314
+ return prompt
315
+
316
+
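
A minimal sketch of the write/read cycle for the memory bank above, illustrative only; the directory, sample problem, and objective value are invented, and the default BGE embedding model must be cached locally or downloadable:

```python
# Illustrative sketch, not part of the uploaded file.
from src.debate_memory.memory_bank import MemoryBank

bank = MemoryBank(memory_dir="memory_storage")  # default BAAI/bge-small-en-v1.5 embedding

# Only correct solutions are stored; add_case() returns early when is_correct=False.
bank.add_case(
    problem_id=1,
    problem_desc="Maximize profit from two products under a machine-hour limit.",
    solution_code="import gurobipy as gp\n# ... build and solve the model ...",
    objective_value=420.0,
    is_correct=True,
    metadata={"dataset": "IndustryOR"},
)

hits = bank.retrieve_similar_cases("Two-product production planning problem", top_k=2)
print(bank.format_retrieved_cases_for_prompt(hits))
```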
src/debate_memory/memory_intelligence.py ADDED
@@ -0,0 +1,210 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Lightweight helpers for categorising optimisation problems and surfacing
4
+ category-level memory.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import os
11
+ import re
12
+ from collections import defaultdict
13
+ from pathlib import Path
14
+ from typing import Dict, Iterable, List, Set, Tuple
15
+
16
+
17
+ _PKG_DIR = Path(__file__).resolve().parent
18
+ _PROJECT_ROOT = _PKG_DIR.parent.parent
19
+ DEFAULT_GUIDELINE_PATH = str(_PROJECT_ROOT / "memory_storage" / "category_guidelines.jsonl")
20
+
21
+
22
+ class MemoryIntelligence:
23
+ """
24
+ Heuristic problem classifier + guideline loader.
25
+
26
+ The goal is to offer fast, rule-based categorisation that can run
27
+ offline. If the heuristics fail, downstream agents (LLMs) can still
28
+ append tags, but we always return the heuristic view for consistency.
29
+ """
30
+
31
+ CATEGORY_KEYWORDS: Dict[str, Set[str]] = {
32
+ "workforce_planning": {
33
+ "worker",
34
+ "workforce",
35
+ "training",
36
+ "trainee",
37
+ "overtime",
38
+ "hire",
39
+ "fire",
40
+ },
41
+ "inventory_planning": {
42
+ "inventory",
43
+ "backlog",
44
+ "stock",
45
+ "warehouse",
46
+ "storage",
47
+ "holding cost",
48
+ },
49
+ "production_planning": {
50
+ "production",
51
+ "factory",
52
+ "capacity",
53
+ "machine",
54
+ "batch",
55
+ "demand",
56
+ },
57
+ "scheduling": {
58
+ "schedule",
59
+ "sequencing",
60
+ "precedence",
61
+ "flow shop",
62
+ "job shop",
63
+ "makespan",
64
+ },
65
+ "transportation": {
66
+ "transport",
67
+ "shipping",
68
+ "vehicle",
69
+ "route",
70
+ "delivery",
71
+ "supply",
72
+ "demand",
73
+ "shipment",
74
+ },
75
+ "network_flow": {
76
+ "flow",
77
+ "arc",
78
+ "network",
79
+ "node",
80
+ "capacity",
81
+ "supply node",
82
+ "demand node",
83
+ },
84
+ "assignment": {
85
+ "assignment",
86
+ "allocate",
87
+ "task",
88
+ "agent",
89
+ "matching",
90
+ "job",
91
+ },
92
+ "facility_location": {
93
+ "facility",
94
+ "location",
95
+ "plant",
96
+ "open",
97
+ "siting",
98
+ "distribution center",
99
+ },
100
+ "traveling_salesman": {
101
+ "tsp",
102
+ "tour",
103
+ "city",
104
+ "travel",
105
+ "route visiting",
106
+ "cyclic",
107
+ },
108
+ "portfolio_optimization": {
109
+ "portfolio",
110
+ "investment",
111
+ "asset",
112
+ "return",
113
+ "risk",
114
+ "variance",
115
+ },
116
+ }
117
+
118
+ def __init__(self, guideline_path: str = DEFAULT_GUIDELINE_PATH):
119
+ self.guideline_path = guideline_path
120
+ self.guidelines = self._load_guidelines(guideline_path)
121
+
122
+ @staticmethod
123
+ def _load_guidelines(path: str) -> Dict[str, Dict]:
124
+ guidelines: Dict[str, Dict] = {}
125
+ if not path or not os.path.exists(path):
126
+ return guidelines
127
+ with open(path, "r", encoding="utf-8") as fh:
128
+ for line in fh:
129
+ line = line.strip()
130
+ if not line:
131
+ continue
132
+ try:
133
+ payload = json.loads(line)
134
+ except json.JSONDecodeError:
135
+ continue
136
+ category = payload.get("category")
137
+ if not category:
138
+ continue
139
+ guidelines[category] = payload
140
+ return guidelines
141
+
142
+ def classify(self, description: str, top_k: int = 3, minimum_score: int = 1) -> List[Tuple[str, int]]:
143
+ """
144
+ Return a ranked list of (category, score) using keyword heuristics.
145
+ """
146
+ if not description:
147
+ return []
148
+ text = description.lower()
149
+ scores: Dict[str, int] = defaultdict(int)
150
+ for category, keywords in self.CATEGORY_KEYWORDS.items():
151
+ for keyword in keywords:
152
+ occurrences = len(re.findall(r"\b" + re.escape(keyword.lower()) + r"\b", text))
153
+ if occurrences:
154
+ scores[category] += occurrences
155
+ ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
156
+ filtered = [(cat, score) for cat, score in ranked if score >= minimum_score]
157
+ if top_k:
158
+ return filtered[:top_k]
159
+ return filtered
160
+
161
+ def categories_only(self, description: str, top_k: int = 3, minimum_score: int = 1) -> List[str]:
162
+ return [cat for cat, _ in self.classify(description, top_k=top_k, minimum_score=minimum_score)]
163
+
164
+ def guideline_text(
165
+ self,
166
+ categories: Iterable[str],
167
+ include_header: bool = True,
168
+ max_items_per_category: int = 4,
169
+ ) -> str:
170
+ """
171
+ Render guidelines for the provided categories as a markdown string.
172
+ """
173
+ categories = list(dict.fromkeys(categories)) # deduplicate while preserving order
174
+ if not categories:
175
+ return ""
176
+
177
+ lines: List[str] = []
178
+ if include_header:
179
+ lines.append("# Category Playbook")
180
+ lines.append("")
181
+
182
+ for category in categories:
183
+ entry = self.guidelines.get(category)
184
+ if not entry:
185
+ continue
186
+ title = entry.get("title") or category.replace("_", " ").title()
187
+ lines.append(f"## {title}")
188
+ guidelines = entry.get("guidelines") or []
189
+ if not guidelines:
190
+ continue
191
+ for bullet in guidelines[:max_items_per_category]:
192
+ lines.append(f"- {bullet}")
193
+ lines.append("")
194
+
195
+ return "\n".join(lines).strip()
196
+
197
+ def guideline_bullets(self, categories: Iterable[str], max_items_per_category: int = 4) -> List[str]:
198
+ bullets: List[str] = []
199
+ for category in categories:
200
+ entry = self.guidelines.get(category)
201
+ if not entry:
202
+ continue
203
+ title = entry.get("title") or category.replace("_", " ").title()
204
+ guidelines = entry.get("guidelines") or []
205
+ for item in guidelines[:max_items_per_category]:
206
+ bullets.append(f"{title}: {item}")
207
+ return bullets
208
+
209
+
210
+ __all__ = ["MemoryIntelligence", "DEFAULT_GUIDELINE_PATH"]
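
A minimal sketch of the keyword classifier above, illustrative only; the description is invented, and the guideline text depends on whether `memory_storage/category_guidelines.jsonl` is present:

```python
# Illustrative sketch, not part of the uploaded file.
from src.debate_memory.memory_intelligence import MemoryIntelligence

mi = MemoryIntelligence()
desc = "A factory plans production across machines to meet weekly demand."

print(mi.classify(desc, top_k=3))  # e.g. [('production_planning', 3), ('transportation', 1)]
print(mi.guideline_text(mi.categories_only(desc)))  # only the header if no guidelines file exists
```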
src/debate_memory/run_memory_debate.py ADDED
@@ -0,0 +1,580 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Run debates between two models using memory-augmented single generations.
4
+
5
+ This script automatically locates the latest initial-solution files for the
6
+ specified models, runs the parallel debate workflow from `simple_rag/debate.py`,
7
+ and then evaluates the consensus solutions with `execute.py`.
8
+
9
+ Example:
10
+ python run_memory_debate.py \
11
+ --datasets ComplexLP EasyLP \
12
+ --max_rounds 3 \
13
+ --debate_workers 16 \
14
+ --execute_workers 128
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import datetime as dt
20
+ import glob
21
+ import json
22
+ import os
23
+ import re
24
+ import subprocess
25
+ import sys
26
+ from pathlib import Path
27
+ from typing import Dict, List, Optional, Tuple
28
+
29
+ from .config import normalize_dataset_name
30
+
31
+ SCRIPT_DIR = Path(__file__).resolve().parent
32
+ SRC_DIR = SCRIPT_DIR.parent
33
+ PROJECT_ROOT = SRC_DIR.parent
34
+ MONOREPO_ROOT = PROJECT_ROOT.parent
35
+ STANDARD_RESULTS_ROOT = PROJECT_ROOT.parent.parent / "results" / "Agora-Opt"
36
+ DEFAULT_RESULTS_DIR = STANDARD_RESULTS_ROOT / "generation"
37
+ DEFAULT_OUTPUT_ROOT = STANDARD_RESULTS_ROOT / "debate"
38
+ DEFAULT_DEBATE_SCRIPT = MONOREPO_ROOT / "simple_rag" / "debate.py"
39
+ DEFAULT_EXECUTE_SCRIPT = PROJECT_ROOT / "scripts" / "execute.py"
40
+ DEFAULT_DEBATE_MEMORY_DIR = PROJECT_ROOT / "debate_memory_storage"
41
+ DEBATE_MEMORY_HEADER = "# Debate Memory Insights"
42
+
43
+ from .memory_bank import MemoryBank
44
+
45
+
46
+ def format_debate_memory_context(cases: List[Dict]) -> str:
47
+ if not cases:
48
+ return ""
49
+ lines = [DEBATE_MEMORY_HEADER, ""]
50
+ for idx, item in enumerate(cases, 1):
51
+ case = item["case"]
52
+ score = item.get("score", 0.0)
53
+ metadata = case.get("metadata", {})
54
+ dataset = metadata.get("dataset", "unknown")
55
+ summary = metadata.get("summary", {}).get("summary")
56
+ lines.append(f"## Case {idx} (similarity {score:.3f}, dataset {dataset})")
57
+ description = case.get("description", "").strip()
58
+ if description:
59
+ snippet = description if len(description) <= 800 else description[:800] + "\n..."
60
+ lines.append(snippet)
61
+ if summary:
62
+ lines.append("Summary: " + summary)
63
+ lines.append("---")
64
+ return "\n".join(lines).strip()
65
+
66
+
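A small sketch of what `format_debate_memory_context` renders; the retrieved case below is a toy payload (not real memory content), and the function is assumed to be importable from this module.

```python
# Toy example of the structure retrieve_similar_cases() is expected to return.
toy_cases = [
    {
        "score": 0.87,
        "case": {
            "description": "A plant mixes two products under machine-hour limits ...",
            "metadata": {
                "dataset": "EasyLP",
                "summary": {"summary": "Classic LP; double-check variable bounds."},
            },
        },
    }
]
# Produces the "# Debate Memory Insights" header, then
# "## Case 1 (similarity 0.870, dataset EasyLP)" with the snippet and summary lines.
print(format_debate_memory_context(toy_cases))
```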
67
+ def build_debate_memory_contexts(
68
+ files: List[str],
69
+ debate_memory: Optional[MemoryBank],
70
+ dataset: str,
71
+ top_k: int,
72
+ ) -> Dict[int, str]:
73
+ contexts: Dict[int, str] = {}
74
+ if debate_memory is None or top_k <= 0:
75
+ return contexts
76
+ for file_path in files:
77
+ with open(file_path, "r", encoding="utf-8") as fh:
78
+ for line in fh:
79
+ if not line.strip():
80
+ continue
81
+ data = json.loads(line)
82
+ problem_id = data.get("id")
83
+ if problem_id is None or problem_id in contexts:
84
+ continue
85
+ description = data.get("description", "")
86
+ if not description.strip():
87
+ contexts[problem_id] = ""
88
+ continue
89
+ cases = debate_memory.retrieve_similar_cases(
90
+ description,
91
+ top_k=top_k,
92
+ preferred_dataset=dataset,
93
+ )
94
+ contexts[problem_id] = format_debate_memory_context(cases)
95
+ return contexts
96
+
97
+
98
+ def maybe_enrich_generation_file(
99
+ source_path: str,
100
+ destination_path: str,
101
+ contexts: Dict[int, str],
102
+ ) -> str:
103
+ if not contexts:
104
+ return source_path
105
+ changed = False
106
+ enriched_lines: List[str] = []
107
+ with open(source_path, "r", encoding="utf-8") as fh:
108
+ for line in fh:
109
+ if not line.strip():
110
+ continue
111
+ data = json.loads(line)
112
+ pid = data.get("id")
113
+ context = contexts.get(pid)
114
+ if context:
115
+ data["description"] = f"{data.get('description', '').strip()}\n\n{context}"
116
+ changed = True
117
+ enriched_lines.append(json.dumps(data, ensure_ascii=False))
118
+ if not changed:
119
+ return source_path
120
+ with open(destination_path, "w", encoding="utf-8") as fh:
121
+ for entry in enriched_lines:
122
+ fh.write(entry + "\n")
123
+ return destination_path
124
+
125
+
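The enrichment step is easiest to see on a single record. The sketch below is illustrative, uses hypothetical values, and assumes `maybe_enrich_generation_file` above is importable; it shows that the memory context is appended to the `description` field and the enriched copy is only used when at least one record changed.

```python
# Illustrative: enrich one generation record with a retrieved memory context.
import json
import os
import tempfile

record = {"id": 7, "description": "Maximize profit for a two-product mix.", "code": "..."}
contexts = {7: "# Debate Memory Insights\n\n## Case 1 (similarity 0.900, dataset EasyLP)\n..."}

src = tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False, encoding="utf-8")
src.write(json.dumps(record) + "\n")
src.close()
dst = src.name + ".debate_memory.jsonl"

used = maybe_enrich_generation_file(src.name, dst, contexts)
print(used == dst)  # True: a record changed, so the enriched copy is used for the debate
print(json.loads(open(dst, encoding="utf-8").readline())["description"])
os.remove(src.name)
os.remove(dst)
```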
126
+ def parse_args() -> argparse.Namespace:
127
+ parser = argparse.ArgumentParser(
128
+ description="Parallel debate runner for memory-enhanced single generations"
129
+ )
130
+ parser.add_argument(
131
+ "--modelA",
132
+ type=str,
133
+ default="gpt-4o",
134
+ help="First model in the debate (default: gpt-4o)",
135
+ )
136
+ parser.add_argument(
137
+ "--modelB",
138
+ type=str,
139
+ default="deepseek-chat",
140
+ help="Second model in the debate (default: deepseek-chat)",
141
+ )
142
+ parser.add_argument(
143
+ "--results_dir",
144
+ type=str,
145
+ default=str(DEFAULT_RESULTS_DIR),
146
+ help="Directory that stores initial-solution JSONL files",
147
+ )
148
+ parser.add_argument(
149
+ "--datasets",
150
+ nargs="*",
151
+ default=None,
152
+ help="Datasets to debate. If omitted, auto-detect common datasets.",
153
+ )
154
+ parser.add_argument(
155
+ "--output_root",
156
+ type=str,
157
+ default=str(DEFAULT_OUTPUT_ROOT),
158
+ help="Root directory to store debate/eval outputs",
159
+ )
160
+ parser.add_argument(
161
+ "--debate_script",
162
+ type=str,
163
+ default=str(DEFAULT_DEBATE_SCRIPT),
164
+ help="Path to simple_rag/debate.py (override if needed)",
165
+ )
166
+ parser.add_argument(
167
+ "--execute_script",
168
+ type=str,
169
+ default=str(DEFAULT_EXECUTE_SCRIPT),
170
+ help="Path to debate_with_memory/execute.py (override if needed)",
171
+ )
172
+ parser.add_argument(
173
+ "--max_rounds",
174
+ type=int,
175
+ default=3,
176
+ help="Maximum number of debate rounds (default: 3)",
177
+ )
178
+ parser.add_argument(
179
+ "--temperature",
180
+ type=float,
181
+ default=0.01,
182
+ help="Temperature for debate LLM calls (default: 0.01)",
183
+ )
184
+ parser.add_argument(
185
+ "--debate_workers",
186
+ type=int,
187
+ default=16,
188
+ help="Parallel workers for debate (ThreadPool inside debate.py)",
189
+ )
190
+ parser.add_argument(
191
+ "--execute_workers",
192
+ type=int,
193
+ default=128,
194
+ help="Parallel workers for execute.py evaluation",
195
+ )
196
+ parser.add_argument(
197
+ "--max_problems",
198
+ type=int,
199
+ default=None,
200
+ help="Optional cap on number of problems per dataset",
201
+ )
202
+ parser.add_argument(
203
+ "--tolerance",
204
+ type=float,
205
+ default=0.05,
206
+ help="Relative tolerance for evaluation accuracy comparison",
207
+ )
208
+ parser.add_argument(
209
+ "--timeout",
210
+ type=int,
211
+ default=90,
212
+ help="Timeout (seconds) for executing consensus code",
213
+ )
214
+ parser.add_argument(
215
+ "--relative_tolerance",
216
+ action="store_true",
217
+ help="Pass --use_relative_tolerance to execute.py",
218
+ )
219
+ parser.add_argument(
220
+ "--save_execution_stdout",
221
+ action="store_true",
222
+ help="Store stdout/stderr for consensus executions",
223
+ )
224
+ parser.add_argument(
225
+ "--execute_memory_dir",
226
+ type=str,
227
+ default=None,
228
+ help="Optional memory_storage directory forwarded to execute.py during consensus evaluation.",
229
+ )
230
+ parser.add_argument(
231
+ "--execute_debug_memory_path",
232
+ type=str,
233
+ default=None,
234
+ help="Optional debug_memory.jsonl path forwarded to execute.py during consensus evaluation.",
235
+ )
236
+ parser.add_argument(
237
+ "--execute_disable_debug_memory",
238
+ action="store_true",
239
+ help="Pass --disable_debug_memory to execute.py during consensus evaluation.",
240
+ )
241
+ parser.add_argument(
242
+ "--dry_run",
243
+ action="store_true",
244
+ help="Only print the planned actions without running debate/eval",
245
+ )
246
+ parser.add_argument(
247
+ "--debate_memory_dir",
248
+ type=str,
249
+ default=str(DEFAULT_DEBATE_MEMORY_DIR),
250
+ help="Directory containing debate memory cases for prompt augmentation",
251
+ )
252
+ parser.add_argument(
253
+ "--debate_memory_top_k",
254
+ type=int,
255
+ default=2,
256
+ help="How many debate memory cases to retrieve per problem",
257
+ )
258
+ parser.add_argument(
259
+ "--disable_debate_memory",
260
+ action="store_true",
261
+ help="Skip retrieval even if debate memory directory exists",
262
+ )
263
+ parser.add_argument(
264
+ "--embedding_model",
265
+ type=str,
266
+ default=None,
267
+ help="Embedding model name or local path (default: BAAI/bge-small-en-v1.5). "
268
+ "Use local path to avoid Hugging Face downloads, or set HF_HUB_OFFLINE=1 environment variable.",
269
+ )
270
+ return parser.parse_args()
271
+
272
+
273
+ def normalize_dataset_list(raw_list: Optional[List[str]]) -> Optional[List[str]]:
274
+ """Split comma-separated values and strip whitespace."""
275
+ if not raw_list:
276
+ return None
277
+ datasets: List[str] = []
278
+ for item in raw_list:
279
+ parts = [part.strip() for part in item.split(",") if part.strip()]
280
+ datasets.extend(normalize_dataset_name(part) for part in parts)
281
+ return datasets or None
282
+
283
+
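Dataset arguments may be given space-separated, comma-separated, or mixed; `normalize_dataset_list` flattens them before lookup. A quick sketch follows; the exact spelling of the normalized names depends on `normalize_dataset_name` from `config.py`, so the output in the comment is an assumption.

```python
# Mixed comma/space input is flattened into one normalized list.
print(normalize_dataset_list(["ComplexLP,EasyLP", " NL4OPT "]))
# e.g. ["ComplexLP", "EasyLP", "NL4OPT"]  (assuming normalize_dataset_name keeps these spellings)
print(normalize_dataset_list([]))  # None: empty input means "auto-detect datasets"
```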
284
+ def collect_runs(results_dir: str, model: str) -> Dict[str, List[Tuple[str, str]]]:
285
+ """
286
+ Return mapping dataset -> list of (timestamp, path) sorted ascending.
287
+ Skips evaluation artifacts (suffixes containing '_eval').
288
+ """
289
+ pattern = os.path.join(results_dir, f"{model}_*.jsonl")
290
+ regex = re.compile(rf"{re.escape(model)}_(.+)_(\d{{8}}_\d{{6}})\.jsonl$")
291
+ runs: Dict[str, List[Tuple[str, str]]] = {}
292
+
293
+ for path in glob.glob(pattern):
294
+ base = os.path.basename(path)
295
+ match = regex.match(base)
296
+ if not match:
297
+ continue
298
+ dataset = normalize_dataset_name(match.group(1))
299
+ if "_eval" in dataset:
300
+ continue
301
+ timestamp = match.group(2)
302
+ runs.setdefault(dataset, []).append((timestamp, path))
303
+
304
+ for dataset in runs:
305
+ runs[dataset].sort() # chronological
306
+
307
+ return runs
308
+
309
+
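`collect_runs` relies on the generation filename convention `{model}_{dataset}_{YYYYMMDD_HHMMSS}.jsonl`. The sketch below shows which names the regex accepts; the filenames are hypothetical.

```python
import re

model = "gpt-4o"
regex = re.compile(rf"{re.escape(model)}_(.+)_(\d{{8}}_\d{{6}})\.jsonl$")

for name in [
    "gpt-4o_ComplexLP_20250101_120000.jsonl",       # matched: dataset group is "ComplexLP"
    "gpt-4o_ComplexLP_eval_20250101_120000.jsonl",  # matched, then dropped by the "_eval" check
    "gpt-4o_ComplexLP.jsonl",                       # no timestamp suffix -> ignored
]:
    m = regex.match(name)
    print(name, "->", m.group(1) if m else None)
```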
310
+ def pick_latest(runs: Dict[str, List[Tuple[str, str]]], dataset: str) -> Optional[str]:
311
+ """Return latest file path for dataset if available."""
312
+ entries = runs.get(dataset)
313
+ if not entries:
314
+ return None
315
+ return entries[-1][1]
316
+
317
+
318
+ def stream_command(cmd: List[str], cwd: str, log_path: str) -> None:
319
+ """Run a subprocess, streaming output to stdout and a log file."""
320
+ print(f"\n▶ Running: {' '.join(cmd)}", flush=True)
321
+ print(f" cwd: {cwd}", flush=True)
322
+ os.makedirs(os.path.dirname(log_path), exist_ok=True)
323
+
324
+ with open(log_path, "w", encoding="utf-8") as log_file:
325
+ process = subprocess.Popen(
326
+ cmd,
327
+ cwd=cwd,
328
+ stdout=subprocess.PIPE,
329
+ stderr=subprocess.STDOUT,
330
+ text=True,
331
+ encoding="utf-8",
332
+ errors="replace",
333
+ bufsize=1,
334
+ )
335
+ assert process.stdout is not None # for type checkers
336
+ for line in process.stdout:
337
+ print(line, end="", flush=True)
338
+ log_file.write(line)
339
+ log_file.flush()
340
+ return_code = process.wait()
341
+
342
+ if return_code != 0:
343
+ raise subprocess.CalledProcessError(return_code, cmd)
344
+
345
+
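`stream_command` tees child-process output to the console and to a per-run log file so long debates stay inspectable while they run, and it raises on a non-zero exit code. A minimal sketch, assuming the function above is importable; the log path is hypothetical.

```python
import sys

# Echo a line from a child Python process; the output appears on stdout and in the log file.
stream_command(
    [sys.executable, "-c", "print('hello from a child process')"],
    cwd=".",
    log_path="logs/example_stream.log",  # hypothetical location; the parent dir is created
)
```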
346
+ def load_eval_report(report_path: str) -> Optional[Dict]:
347
+ if not os.path.exists(report_path):
348
+ return None
349
+ with open(report_path, "r", encoding="utf-8") as fh:
350
+ return json.load(fh)
351
+
352
+
353
+ def ensure_script(path: str, description: str) -> None:
354
+ if not os.path.isfile(path):
355
+ raise FileNotFoundError(f"{description} not found: {path}")
356
+
357
+
358
+ def main() -> None:
359
+ args = parse_args()
360
+ args.datasets = normalize_dataset_list(args.datasets)
361
+ args.output_root = os.path.abspath(args.output_root)
362
+ args.results_dir = os.path.abspath(args.results_dir)
363
+
364
+ debate_memory_bank: Optional[MemoryBank] = None
365
+ if not args.disable_debate_memory and args.debate_memory_dir:
366
+ debate_memory_path = Path(args.debate_memory_dir)
367
+ if debate_memory_path.exists():
368
+ try:
369
+ embedding_model = args.embedding_model if args.embedding_model else "BAAI/bge-small-en-v1.5"
370
+ debate_memory_bank = MemoryBank(
371
+ memory_dir=str(debate_memory_path),
372
+ embedding_model=embedding_model
373
+ )
374
+ except Exception as exc: # noqa: BLE001
375
+ print(f"⚠️ Warning: failed to load debate memory from {debate_memory_path}: {exc}")
376
+ else:
377
+ print(f"ℹ️ Debate memory directory not found: {debate_memory_path} (skipping context retrieval)")
378
+
379
+ ensure_script(args.debate_script, "Debate script")
380
+ ensure_script(args.execute_script, "Execute script")
381
+
382
+ modelA_runs = collect_runs(args.results_dir, args.modelA)
383
+ modelB_runs = collect_runs(args.results_dir, args.modelB)
384
+
385
+ if args.datasets:
386
+ datasets = args.datasets
387
+ else:
388
+ datasets = sorted(set(modelA_runs.keys()) & set(modelB_runs.keys()))
389
+
390
+ if not datasets:
391
+ print("❌ No common datasets with available runs were found.")
392
+ sys.exit(1)
393
+
394
+ print("=" * 80)
395
+ print("🧠 Memory-Based Debate Runner")
396
+ print("=" * 80)
397
+ print(f"Model A: {args.modelA}")
398
+ print(f"Model B: {args.modelB}")
399
+ print(f"Datasets: {', '.join(datasets)}")
400
+ print(f"Results dir: {args.results_dir}")
401
+ print(f"Output root: {args.output_root}")
402
+ print(f"Debate workers: {args.debate_workers} (parallel)")
403
+ print("=" * 80)
404
+
405
+ timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
406
+ dataset_summaries: List[Dict] = []
407
+ processed = 0
408
+
409
+ for dataset in datasets:
410
+ file_a = pick_latest(modelA_runs, dataset)
411
+ file_b = pick_latest(modelB_runs, dataset)
412
+
413
+ if not file_a or not file_b:
414
+ print(f"⚠️ Skipping {dataset}: missing runs for one of the models.")
415
+ dataset_summaries.append(
416
+ {
417
+ "dataset": dataset,
418
+ "status": "missing_runs",
419
+ "modelA_file": file_a,
420
+ "modelB_file": file_b,
421
+ }
422
+ )
423
+ continue
424
+
425
+ run_dir = os.path.join(
426
+ args.output_root,
427
+ dataset,
428
+ f"{timestamp}_{args.modelA}_vs_{args.modelB}",
429
+ )
430
+ os.makedirs(run_dir, exist_ok=True)
431
+
432
+ print(f"\n{'=' * 80}")
433
+ print(f"🚀 Dataset: {dataset}")
434
+ print(f" Model A file: {file_a}")
435
+ print(f" Model B file: {file_b}")
436
+ print(f" Output dir: {run_dir}")
437
+ print(f"{'=' * 80}")
438
+
439
+ file_a_for_debate = file_a
440
+ file_b_for_debate = file_b
441
+ if not args.dry_run and debate_memory_bank and args.debate_memory_top_k > 0:
442
+ contexts = build_debate_memory_contexts(
443
+ [file_a, file_b], debate_memory_bank, dataset, args.debate_memory_top_k
444
+ )
445
+ if any(contexts.values()):
446
+ print(" 🧠 Injecting debate memory context into prompts")
447
+ enriched_a = os.path.join(
448
+ run_dir, f"{os.path.basename(file_a)}.debate_memory.jsonl"
449
+ )
450
+ enriched_b = os.path.join(
451
+ run_dir, f"{os.path.basename(file_b)}.debate_memory.jsonl"
452
+ )
453
+ file_a_for_debate = maybe_enrich_generation_file(file_a, enriched_a, contexts)
454
+ file_b_for_debate = maybe_enrich_generation_file(file_b, enriched_b, contexts)
455
+
456
+ if args.dry_run:
457
+ print("Dry-run mode → skipping actual execution.")
458
+ dataset_summaries.append(
459
+ {
460
+ "dataset": dataset,
461
+ "status": "dry_run",
462
+ "debate_dir": run_dir,
463
+ "modelA_file": file_a,
464
+ "modelB_file": file_b,
465
+ }
466
+ )
467
+ continue
468
+
469
+ # 1) Run debate
470
+ debate_cmd = [
471
+ sys.executable,
472
+ "-u",
473
+ args.debate_script,
474
+ "--resultA",
475
+ file_a_for_debate,
476
+ "--resultB",
477
+ file_b_for_debate,
478
+ "--modelA",
479
+ args.modelA,
480
+ "--modelB",
481
+ args.modelB,
482
+ "--save_dir",
483
+ run_dir,
484
+ "--max_rounds",
485
+ str(args.max_rounds),
486
+ "--temperature",
487
+ str(args.temperature),
488
+ "--num_workers",
489
+ str(args.debate_workers),
490
+ ]
491
+ if args.max_problems is not None:
492
+ debate_cmd.extend(["--max_problems", str(args.max_problems)])
493
+
494
+ debate_log = os.path.join(run_dir, "debate.log")
495
+ stream_command(debate_cmd, cwd=str(MONOREPO_ROOT), log_path=debate_log)
496
+
497
+ consensus_file = os.path.join(
498
+ run_dir, f"consensus_{args.modelA}_vs_{args.modelB}.jsonl"
499
+ )
500
+ if not os.path.exists(consensus_file):
501
+ raise FileNotFoundError(
502
+ f"Consensus file not found after debate: {consensus_file}"
503
+ )
504
+
505
+ # 2) Evaluate consensus
506
+ eval_dir = os.path.join(run_dir, "eval_consensus")
507
+ eval_cmd = [
508
+ sys.executable,
509
+ "-u",
510
+ args.execute_script,
511
+ "--input_file",
512
+ consensus_file,
513
+ "--output_dir",
514
+ eval_dir,
515
+ "--timeout",
516
+ str(args.timeout),
517
+ "--tolerance",
518
+ str(args.tolerance),
519
+ "--num_workers",
520
+ str(args.execute_workers),
521
+ ]
522
+ if args.relative_tolerance:
523
+ eval_cmd.append("--use_relative_tolerance")
524
+ if args.save_execution_stdout:
525
+ eval_cmd.append("--save_output")
526
+ if args.execute_memory_dir:
527
+ eval_cmd.extend(["--memory_dir", args.execute_memory_dir])
528
+ if args.execute_debug_memory_path:
529
+ eval_cmd.extend(["--debug_memory_path", args.execute_debug_memory_path])
530
+ if args.execute_disable_debug_memory:
531
+ eval_cmd.append("--disable_debug_memory")
532
+ if args.embedding_model:
533
+ eval_cmd.extend(["--embedding_model", args.embedding_model])
534
+
535
+ eval_log = os.path.join(run_dir, "evaluate.log")
536
+ stream_command(eval_cmd, cwd=str(PROJECT_ROOT), log_path=eval_log)
537
+
538
+ report_path = os.path.join(eval_dir, "evaluation_report.json")
539
+ report = load_eval_report(report_path)
540
+ if report is None:
541
+ raise FileNotFoundError(f"Missing evaluation report: {report_path}")
542
+
543
+ dataset_summaries.append(
544
+ {
545
+ "dataset": dataset,
546
+ "status": "completed",
547
+ "debate_dir": run_dir,
548
+ "accuracy": report.get("accuracy"),
549
+ "correct": report.get("correct"),
550
+ "total": report.get("total_problems"),
551
+ "report_path": report_path,
552
+ }
553
+ )
554
+ processed += 1
555
+
556
+ print("\n" + "=" * 80)
557
+ print("📊 Debate + Evaluation Summary")
558
+ print("=" * 80)
559
+ for item in dataset_summaries:
560
+ dataset = item["dataset"]
561
+ status = item["status"]
562
+ if status == "completed":
563
+ accuracy = item.get("accuracy")
564
+ correct = item.get("correct")
565
+ total = item.get("total")
566
+ print(
567
+ f"{dataset:25s} → accuracy {accuracy:.2%} ({correct}/{total}) | dir: {item['debate_dir']}"
568
+ )
569
+ elif status == "dry_run":
570
+ print(f"{dataset:25s} → dry run (planned dir: {item['debate_dir']})")
571
+ else:
572
+ print(f"{dataset:25s} → {status} (A={item.get('modelA_file')}, B={item.get('modelB_file')})")
573
+
574
+ print("=" * 80)
575
+ if not args.dry_run and processed == 0:
576
+ sys.exit("No datasets were processed successfully.")
577
+
578
+
579
+ if __name__ == "__main__":
580
+ main()
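For reference, each entry appended to `dataset_summaries` for a completed dataset has the shape sketched below; all values are invented, and the fields mirror the code above (`missing_runs` and `dry_run` entries carry fewer fields).

```python
# Illustrative summary entry for one completed dataset (values made up).
example_summary = {
    "dataset": "ComplexLP",
    "status": "completed",
    "debate_dir": "results/Agora-Opt/debate/ComplexLP/20250101_120000_gpt-4o_vs_deepseek-chat",
    "accuracy": 0.62,
    "correct": 31,
    "total": 50,
    "report_path": "results/Agora-Opt/debate/ComplexLP/.../eval_consensus/evaluation_report.json",
}
```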