Upload 45 files
- README.md +215 -3
- scripts/__pycache__/run_ablation_suite.cpython-311.pyc +0 -0
- scripts/augment_memory_from_standalone_runs.py +16 -0
- scripts/build_debate_memory.py +17 -0
- scripts/build_debug_memory.py +17 -0
- scripts/build_memory_assets.sh +56 -0
- scripts/build_memory_from_eval_results.py +17 -0
- scripts/execute.py +18 -0
- scripts/generate_with_memory.py +17 -0
- scripts/process_all_debate_cases.sh +64 -0
- scripts/run_ablation_suite.py +403 -0
- scripts/run_generate_and_evaluate.sh +640 -0
- scripts/run_memory_debate.py +17 -0
- scripts/test_self_healing_full.sh +92 -0
- src/debate_memory/__init__.py +11 -0
- src/debate_memory/__pycache__/__init__.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/__init__.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/build_memory_from_eval_results.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/config.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/config.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/debate_memory_builder.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/debug_executor.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/debug_memory.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/debug_memory_builder.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/generate_with_memory.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/generate_with_memory.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/llm.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/llm.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/memory_bank.cpython-310.pyc +0 -0
- src/debate_memory/__pycache__/memory_bank.cpython-311.pyc +0 -0
- src/debate_memory/__pycache__/run_memory_debate.cpython-311.pyc +0 -0
- src/debate_memory/augment_memory_from_standalone_runs.py +974 -0
- src/debate_memory/build_memory_from_eval_results.py +293 -0
- src/debate_memory/config.py +189 -0
- src/debate_memory/debate_memory_builder.py +477 -0
- src/debate_memory/debug_executor.py +136 -0
- src/debate_memory/debug_memory.py +163 -0
- src/debate_memory/debug_memory_builder.py +150 -0
- src/debate_memory/debug_utils.py +99 -0
- src/debate_memory/execute.py +522 -0
- src/debate_memory/generate_with_memory.py +920 -0
- src/debate_memory/llm.py +111 -0
- src/debate_memory/memory_bank.py +316 -0
- src/debate_memory/memory_intelligence.py +210 -0
- src/debate_memory/run_memory_debate.py +580 -0
README.md
CHANGED
# Agora-Opt Code Package

## What This Directory Contains

`./code/Agora-Opt/` is the source directory for the Agora-Opt method. It retains two categories of assets:

- the Agora-Opt implementation
- prebuilt memory assets used by the method

Historical run outputs are not stored here.

For compatibility with the original stage naming, the main reproduction script maintains two convenience paths:

- `generated_with_memory`
- `debate_runs`
## Important Subdirectories

The most important components are:

- `src/debate_memory/`: core Agora-Opt implementation
- `scripts/`: command-line wrappers
- `memory_storage/`: solution memory
- `debug_case_memory/`: debug memory retrieval bank
- `debate_memory_storage/`: debate memory retrieval bank
- `memory_variants/`: retained alternative memory variants
- `memory_backups/`: retained memory backups

Multiple memory versions are intentionally kept. They were prepared during different stages of the project and can all be treated as available assets for generation, debugging, and debate.
## Core Workflow

Agora-Opt runs in two stages.

### Stage 1: Generate Initial Solutions

`generate_with_memory.py` generates candidate solutions, optionally using solution memory and debug memory.

Primary entry script:

- `scripts/generate_with_memory.py`

This stage:

- reads benchmark problems
- retrieves similar solved cases from `memory_storage/`
- generates candidate modeling code
- uses debug memory during self-repair when execution fails
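A typical Stage 1 invocation, using flags that `scripts/run_generate_and_evaluate.sh` and `scripts/run_ablation_suite.py` also pass (the dataset name and output path here are illustrative):

```bash
python scripts/generate_with_memory.py \
  --dataset IndustryOR \
  --model gpt-4o \
  --temperature 0.01 \
  --memory_dir memory_storage \
  --memory_top_k 3 \
  --max_retries 5 \
  --execution_timeout 60 \
  --output generated_with_memory/gpt-4o_IndustryOR.jsonl
```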
### Stage 2: Run Debate

`run_memory_debate.py` takes two sets of initial solutions and runs the decentralized debate stage.

Primary entry script:

- `scripts/run_memory_debate.py`

This stage:

- loads both sides' initial solutions
- retrieves historical debate cases from `debate_memory_storage/`
- performs iterative comparison, revision, and convergence
- executes and evaluates the final consensus solution
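The debate wrapper's flag set is defined in `src/debate_memory/run_memory_debate.py` and is not reproduced here; assuming it exposes an argparse CLI like the sibling wrappers, its options can be listed with:

```bash
python scripts/run_memory_debate.py --help
```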
## Memory Types

### 1. Solution Memory

Directory:

- `memory_storage/`

Purpose:

- retrieves similar successful modeling cases during generation
- supplies formulation templates and structural priors

Build path:

- extract `(problem description, correct code, objective value)` from correctly evaluated runs
- build `cases.jsonl` plus its retrieval index

Related script:

- `scripts/build_memory_from_eval_results.py`
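To rebuild this store directly, the wrapper can be invoked the same way `build_memory_assets.sh` invokes it (the evaluation directory is a placeholder):

```bash
python scripts/build_memory_from_eval_results.py \
  --eval_dirs /path/to/eval_dir1 \
  --benchmarks_dir ../../data/benchmarks \
  --memory_dir memory_storage
```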
### 2. Debug Memory

Directory:

- `debug_case_memory/`

Purpose:

- retrieves similar execution failures and repair experience
- supports automatic self-debugging during generation

Build path:

- extract unique error signatures from `debug_memory.jsonl` and its backups
- normalize the error text, repair hints, and metadata into a retrieval bank

Related script:

- `scripts/build_debug_memory.py`

Note:

- raw debug logs are stored in `memory_storage/debug_memory.jsonl`
- that log file is one of the inputs used to build debug memory
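`build_memory_assets.sh` invokes this wrapper with only an output directory, which is enough to consolidate the raw log and its backups:

```bash
python scripts/build_debug_memory.py --output_dir debug_case_memory
```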
### 3. Debate Memory

Directory:

- `debate_memory_storage/`

Purpose:

- stores examples of how disagreements were resolved during debate
- helps later debates converge more efficiently

Build path:

- select historical runs where the two initial solutions disagreed
- keep cases where debate eventually converged successfully
- extract the dispute, key arguments, and final converged code

Related scripts:

- `scripts/build_debate_memory.py`
- `scripts/process_all_debate_cases.sh`
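A direct rebuild uses the same flags `process_all_debate_cases.sh` passes (the runs path is an example):

```bash
python scripts/build_debate_memory.py \
  --runs_root ../../results/Agora-Opt/debate \
  --output_dir debate_memory_storage \
  --llm_model gpt-4o
```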
## Suggested Memory Construction Order

When preparing memory from scratch, the recommended order is:

1. run generation and evaluation to obtain `evaluation_results`
2. build solution memory from correct cases
3. build debug memory from accumulated `debug_memory.jsonl`
4. build debate memory from historical debate runs

The dependency flow is:

- `evaluation_results` -> `solution memory`
- `debug_memory.jsonl` -> `debug memory`
- debate run artifacts -> `debate memory`
## Retained Memory Assets

This directory intentionally keeps:

- the three primary memory stores
- memory variants
- memory backups

These are treated as static method assets.

Historical run outputs are not retained here, which keeps source code, memory assets, and new results clearly separated.

To rebuild the three memory types, use:

```bash
bash ./code/Agora-Opt/scripts/build_memory_assets.sh /path/to/eval_dir1 /path/to/eval_dir2
```

That script attempts to:

- rebuild solution memory from evaluation directories
- rebuild debug memory from `debug_memory.jsonl` and its backups
- rebuild debate memory from debate run artifacts
## Recommended Entry Points

For paper reproduction, use the outer scripts rather than manually assembling commands in this directory:

- main table: `./code/scripts/run_agora.sh`
- 5.1: `./code/experiments/5.1_compatibility_backbone_llms/`
- 5.2: `./code/experiments/5.2_ablation_study/`
- 5.3.1: `./code/experiments/5.3.1_centralized_judge_selection/`
- 5.3.2: `./code/experiments/5.3.2_impact_of_debate_rounds/`
- 5.3.3: `./code/experiments/5.3.3_generalization_of_decentralized_debate_protocol/`
## Direct Source-Level Usage

For direct method-level use, the main wrappers are:

```bash
python scripts/generate_with_memory.py
python scripts/run_memory_debate.py
python scripts/execute.py
python scripts/build_memory_from_eval_results.py
python scripts/build_debug_memory.py
python scripts/build_debate_memory.py
```
## Path Conventions

Within the open-source package, the intended layout is:

- benchmark data: `./data/benchmarks/`
- Agora-Opt source code and memory: `./code/Agora-Opt/`

This separation makes the boundaries between code, memory assets, and newly generated outputs explicit.
scripts/__pycache__/run_ablation_suite.cpython-311.pyc
ADDED
Binary file (19.3 kB).
scripts/augment_memory_from_standalone_runs.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper for debate_memory.augment_memory_from_standalone_runs."""

from pathlib import Path
import sys

# Make src/ importable so the debate_memory package resolves when this
# wrapper is run directly from scripts/.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.augment_memory_from_standalone_runs import main


if __name__ == "__main__":
    main()
```
scripts/build_debate_memory.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper for debate_memory.debate_memory_builder."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.debate_memory_builder import main


if __name__ == "__main__":
    main()
```
scripts/build_debug_memory.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper to consolidate debug_memory.jsonl entries into a memory bank."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.debug_memory_builder import main


if __name__ == "__main__":
    main()
```
scripts/build_memory_assets.sh
ADDED
```bash
#!/bin/bash

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
AGORA_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
OPEN_ROOT="$(cd "${AGORA_DIR}/../.." && pwd)"
RESULTS_ROOT="${OPEN_ROOT}/results"
BENCHMARK_DIR="${OPEN_ROOT}/data/benchmarks"
PYTHON_BIN="${PYTHON_BIN:-python3}"

SOLUTION_MEMORY_DIR="${SOLUTION_MEMORY_DIR:-${AGORA_DIR}/memory_storage}"
DEBUG_CASE_MEMORY_DIR="${DEBUG_CASE_MEMORY_DIR:-${AGORA_DIR}/debug_case_memory}"
DEBATE_MEMORY_DIR="${DEBATE_MEMORY_DIR:-${AGORA_DIR}/debate_memory_storage}"
DEBATE_RUNS_ROOT="${DEBATE_RUNS_ROOT:-${RESULTS_ROOT}/Agora-Opt/debate}"

export PYTHONPATH="${AGORA_DIR}/src:${PYTHONPATH:-}"

echo "============================================================"
echo "Agora-Opt Memory Builder"
echo "============================================================"
echo "Solution memory: ${SOLUTION_MEMORY_DIR}"
echo "Debug memory:    ${DEBUG_CASE_MEMORY_DIR}"
echo "Debate memory:   ${DEBATE_MEMORY_DIR}"
echo "Debate runs:     ${DEBATE_RUNS_ROOT}"
echo "============================================================"
echo

if [[ "$#" -gt 0 ]]; then
  echo "Building solution memory from evaluation directories..."
  "${PYTHON_BIN}" "${SCRIPT_DIR}/build_memory_from_eval_results.py" \
    --eval_dirs "$@" \
    --benchmarks_dir "${BENCHMARK_DIR}" \
    --memory_dir "${SOLUTION_MEMORY_DIR}"
  echo
else
  echo "Skipping solution memory rebuild because no evaluation directories were provided."
  echo "Usage example:"
  echo "  bash ./code/Agora-Opt/scripts/build_memory_assets.sh /path/to/eval_dir1 /path/to/eval_dir2"
  echo
fi

echo "Building debug memory..."
"${PYTHON_BIN}" "${SCRIPT_DIR}/build_debug_memory.py" \
  --output_dir "${DEBUG_CASE_MEMORY_DIR}"
echo

if [[ -d "${DEBATE_RUNS_ROOT}" ]]; then
  echo "Building debate memory..."
  "${PYTHON_BIN}" "${SCRIPT_DIR}/build_debate_memory.py" \
    --runs_root "${DEBATE_RUNS_ROOT}" \
    --output_dir "${DEBATE_MEMORY_DIR}"
else
  echo "Skipping debate memory rebuild because debate runs root does not exist:"
  echo "  ${DEBATE_RUNS_ROOT}"
fi
```
scripts/build_memory_from_eval_results.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper for debate_memory.build_memory_from_eval_results."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.build_memory_from_eval_results import main


if __name__ == "__main__":
    main()
```
scripts/execute.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper to run debate_memory.execute with package imports resolved."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.execute import parse_args, main


if __name__ == "__main__":
    args = parse_args()
    main(args)
```
scripts/generate_with_memory.py
ADDED
```python
#!/usr/bin/env python3
"""Wrapper to run debate_memory.generate_with_memory as a script."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.generate_with_memory import main


if __name__ == "__main__":
    main()
```
scripts/process_all_debate_cases.sh
ADDED
```bash
#!/bin/bash

# Batch process every historical debate run and refresh the debate memory bank.
#
# Usage:
#   ./scripts/process_all_debate_cases.sh [runs_root] [output_dir]
# Example:
#   ./scripts/process_all_debate_cases.sh \
#     ../../results/Agora-Opt/debate \
#     debate_memory_storage
#
# Environment variables (optional):
#   LLM_MODEL    - override default gpt-4o summarizer
#   LLM_ATTEMPTS - retries per case (default 2)
#   MAX_WORKERS  - thread pool size (default 64)
#   PYTHON_BIN   - python executable (default python)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
DEFAULT_RUNS_ROOT="${PROJECT_ROOT}/../../results/Agora-Opt/debate"

RUNS_ROOT="${1:-$DEFAULT_RUNS_ROOT}"
OUTPUT_DIR="${2:-${PROJECT_ROOT}/debate_memory_storage}"

LLM_MODEL="${LLM_MODEL:-gpt-4o}"
LLM_ATTEMPTS="${LLM_ATTEMPTS:-2}"
MAX_WORKERS="${MAX_WORKERS:-64}"
PYTHON_BIN="${PYTHON_BIN:-python}"

echo "============================================================"
echo "🧠 Building Debate Memory"
echo "============================================================"
echo "Runs root:     ${RUNS_ROOT}"
echo "Output dir:    ${OUTPUT_DIR}"
echo "LLM model:     ${LLM_MODEL:-<heuristic>}"
echo "LLM attempts:  ${LLM_ATTEMPTS}"
echo "Max workers:   ${MAX_WORKERS}"
echo "Python binary: ${PYTHON_BIN}"
echo "============================================================"
echo

CMD=(
  "${PYTHON_BIN}"
  "${PROJECT_ROOT}/scripts/build_debate_memory.py"
  "--runs_root" "${RUNS_ROOT}"
  "--output_dir" "${OUTPUT_DIR}"
  "--max_workers" "${MAX_WORKERS}"
  "--llm_attempts" "${LLM_ATTEMPTS}"
)

if [ -n "${LLM_MODEL}" ]; then
  CMD+=("--llm_model" "${LLM_MODEL}")
fi

echo "Running: ${CMD[*]}"
echo

"${CMD[@]}"

echo
echo "✅ Debate memory refreshed."
echo "Cases stored in: ${OUTPUT_DIR}"
```
scripts/run_ablation_suite.py
ADDED
```python
#!/usr/bin/env python3
"""
Run a suite of ablation experiments (generation + evaluation) and summarise results.
"""

from __future__ import annotations

import argparse
import json
import os
import shlex
import subprocess
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Sequence, Tuple

PROJECT_ROOT = Path(__file__).resolve().parent.parent
STANDARD_RESULTS_ROOT = PROJECT_ROOT.parent.parent / "results" / "Agora-Opt"
GENERATE_SCRIPT = PROJECT_ROOT / "scripts" / "generate_with_memory.py"
EXECUTE_SCRIPT = PROJECT_ROOT / "scripts" / "execute.py"
PYTHON_BIN = os.environ.get("PYTHON_BIN", sys.executable)


@dataclass
class Variant:
    name: str
    description: str
    overrides: Dict[str, object]


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Run generate+evaluate ablations and emit a summary table."
    )
    parser.add_argument("--model", type=str, default="gpt-4o", help="LLM to query.")
    parser.add_argument(
        "--datasets",
        nargs="+",
        default=["IndustryOR", "ComplexLP"],
        help="Datasets to evaluate (space-separated, omit .jsonl).",
    )
    parser.add_argument("--temperature", type=float, default=0.01)
    parser.add_argument(
        "--max_problems",
        type=int,
        default=None,
        help="Limit number of problems per dataset (omit for full set).",
    )
    parser.add_argument("--memory_dir", type=str, default="memory_storage")
    parser.add_argument(
        "--memory_top_k",
        type=int,
        default=3,
        help="Base episodic memory retrieval count for the full variant.",
    )
    parser.add_argument(
        "--max_retries",
        type=int,
        default=5,
        help="Base retry budget for the full variant.",
    )
    parser.add_argument(
        "--debug_case_top_k",
        type=int,
        default=3,
        help="Base debug-case retrieval count.",
    )
    parser.add_argument(
        "--parallel",
        type=int,
        default=64,
        help="Workers for generation (passed to --parallel).",
    )
    parser.add_argument(
        "--execution_timeout",
        type=int,
        default=90,
        help="Timeout per execution attempt in generate_with_memory.",
    )
    parser.add_argument(
        "--debug_memory_path",
        type=str,
        default="memory_storage/debug_memory.jsonl",
        help="Path to debug memory JSONL.",
    )
    parser.add_argument(
        "--debug_case_dir",
        type=str,
        default="debug_case_memory",
        help="Directory containing consolidated debug-case memory.",
    )
    parser.add_argument(
        "--output_root",
        type=str,
        default=str(STANDARD_RESULTS_ROOT / "ablations"),
        help="Root folder for storing ablation artefacts.",
    )
    parser.add_argument(
        "--eval_timeout",
        type=int,
        default=90,
        help="Timeout for scripts/execute.py.",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=64,
        help="ProcessPool workers for evaluation.",
    )
    parser.add_argument("--tolerance", type=float, default=0.05)
    parser.add_argument(
        "--relative_tolerance",
        action="store_true",
        help="Use relative tolerance in evaluation.",
    )
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Print commands without executing or aggregating results.",
    )
    return parser.parse_args()


def build_variants(args: argparse.Namespace) -> List[Variant]:
    base = {
        "memory_top_k": args.memory_top_k,
        "use_llm_refinement": True,
        "debug_case_memory_top_k": args.debug_case_top_k,
        "max_retries": args.max_retries,
        "auto_debug": True,
    }
    return [
        Variant(
            name="full_system",
            description="All helpers enabled (reference).",
            overrides={**base},
        ),
        Variant(
            name="no_llm_refine",
            description="Skip LLM summarisation of retrieved cases.",
            overrides={**base, "use_llm_refinement": False},
        ),
        Variant(
            name="no_debug_case_memory",
            description="Disable historical debug-case retrieval.",
            overrides={**base, "debug_case_memory_top_k": 0},
        ),
        Variant(
            name="no_self_healing",
            description="Single attempt (max_retries=1) but still executes locally once.",
            overrides={**base, "max_retries": 1},
        ),
        Variant(
            name="no_memory",
            description="Disable episodic retrieval, keep retries on.",
            overrides={**base, "memory_top_k": 0, "use_llm_refinement": False},
        ),
        Variant(
            name="vanilla_llm",
            description="Pure single-shot LLM (no memory, no auto-debug).",
            overrides={
                **base,
                "memory_top_k": 0,
                "use_llm_refinement": False,
                "debug_case_memory_top_k": 0,
                "max_retries": 1,
                "auto_debug": False,
            },
        ),
    ]


def run_command(cmd: Sequence[str], dry_run: bool = False) -> None:
    pretty = " ".join(shlex.quote(part) for part in cmd)
    print(f"  → {pretty}")
    if dry_run:
        return
    subprocess.run(cmd, check=True)


def compute_attempt_stats(path: Path) -> Tuple[float, int]:
    if not path.exists():
        return 0.0, 0
    total = 0
    total_attempts = 0
    multi_attempt = 0
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            attempts = record.get("total_attempts", 1)
            total_attempts += attempts
            total += 1
            if attempts > 1:
                multi_attempt += 1
    avg = (total_attempts / total) if total else 0.0
    return avg, multi_attempt


def format_percent(value: float) -> str:
    return f"{value * 100:.1f}%"


def build_generate_args(
    dataset: str,
    output_file: Path,
    debug_dir: Path,
    args: argparse.Namespace,
    cfg: Dict[str, object],
) -> List[str]:
    cmd = [
        os.fspath(GENERATE_SCRIPT),
        "--dataset",
        dataset,
        "--model",
        args.model,
        "--temperature",
        str(args.temperature),
        "--output",
        os.fspath(output_file),
        "--memory_dir",
        os.fspath(Path(args.memory_dir).resolve()),
        "--parallel",
        str(args.parallel),
        "--execution_timeout",
        str(args.execution_timeout),
        "--debug_memory_path",
        os.fspath(Path(args.debug_memory_path).resolve()),
        "--debug_case_memory_dir",
        os.fspath(Path(args.debug_case_dir).resolve()),
        "--debug_case_memory_top_k",
        str(int(cfg.get("debug_case_memory_top_k", 0))),
        "--memory_top_k",
        str(int(cfg.get("memory_top_k", 0))),
        "--max_retries",
        str(int(cfg.get("max_retries", 1))),
    ]
    if args.max_problems:
        cmd += ["--max_problems", str(args.max_problems)]
    if cfg.get("use_llm_refinement"):
        cmd.append("--use_llm_refinement")
    if not cfg.get("filter_perfect", True):
        cmd.append("--no_filter_perfect")
    if not cfg.get("auto_debug", True):
        cmd.append("--no_auto_debug")
    if debug_dir:
        cmd += ["--debug_output_dir", os.fspath(debug_dir)]
    return [os.fspath(part) for part in cmd]


def build_execute_args(input_file: Path, output_dir: Path, args: argparse.Namespace) -> List[str]:
    cmd = [
        os.fspath(EXECUTE_SCRIPT),
        "--input_file",
        os.fspath(input_file),
        "--output_dir",
        os.fspath(output_dir),
        "--timeout",
        str(args.eval_timeout),
        "--tolerance",
        str(args.tolerance),
        "--num_workers",
        str(args.num_workers),
        "--memory_dir",
        os.fspath(Path(args.memory_dir).resolve()),
        "--debug_memory_path",
        os.fspath(Path(args.debug_memory_path).resolve()),
    ]
    if args.relative_tolerance:
        cmd.append("--use_relative_tolerance")
    return cmd


def summarise_records(records: List[Dict], summary_path: Path) -> None:
    if not records:
        return
    md_lines = [
        "| Dataset | Variant | Accuracy | Correct/Total | Exec Err % | Timeout % | No-Code % | Avg Attempts | Notes |",
        "| --- | --- | --- | --- | --- | --- | --- | --- | --- |",
    ]
    csv_lines = [
        "dataset,variant,accuracy,correct,total,exec_error_pct,timeout_pct,no_code_pct,avg_attempts,notes"
    ]
    for record in records:
        dataset = record["dataset"]
        variant = record["variant"]
        report = record["report"]
        status_counts = report.get("status_counts", {})
        total = report.get("total_problems", 0)
        accuracy_pct = format_percent(report.get("accuracy", 0.0))
        correct = report.get("correct", 0)
        exec_err_pct = (
            (status_counts.get("execution_error", 0) / total) if total else 0.0
        )
        timeout_pct = (status_counts.get("timeout", 0) / total) if total else 0.0
        no_code_pct = (status_counts.get("no_code", 0) / total) if total else 0.0
        avg_attempts = record.get("avg_attempts", 0.0)
        notes = record["notes"]
        md_lines.append(
            f"| {dataset} | {variant} | {accuracy_pct} | {correct}/{total} | "
            f"{exec_err_pct*100:.1f}% | {timeout_pct*100:.1f}% | {no_code_pct*100:.1f}% | "
            f"{avg_attempts:.2f} | {notes} |"
        )
        safe_notes = notes.replace('"', '""')
        csv_lines.append(
            f"{dataset},{variant},{report.get('accuracy',0.0):.4f},{correct},{total},"
            f"{exec_err_pct:.4f},{timeout_pct:.4f},{no_code_pct:.4f},{avg_attempts:.4f},\"{safe_notes}\""
        )
    summary_path.write_text("\n".join(md_lines) + "\n", encoding="utf-8")
    csv_path = summary_path.with_suffix(".csv")
    csv_path.write_text("\n".join(csv_lines) + "\n", encoding="utf-8")
    print(f"\n✅ Summary table written to: {summary_path}")
    print(f"📄 CSV export written to: {csv_path}")


def main() -> None:
    args = parse_args()
    variants = build_variants(args)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_root = Path(args.output_root).resolve() / timestamp
    if not args.dry_run:
        run_root.mkdir(parents=True, exist_ok=True)

    print("========================================")
    print("Ablation Runner")
    print("========================================")
    print(f"Model: {args.model}")
    print(f"Datasets: {', '.join(args.datasets)}")
    print(f"Output root: {run_root if not args.dry_run else args.output_root}")
    print(f"Dry run: {args.dry_run}")
    print("========================================\n")

    records: List[Dict] = []
    for dataset in args.datasets:
        print(f"Dataset: {dataset}")
        for variant in variants:
            cfg = variant.overrides
            variant_name = variant.name
            print(f"  Variant: {variant_name} – {variant.description}")
            dataset_slug = dataset.replace("/", "_")
            gen_output = (
                run_root / f"{dataset_slug}_{variant_name}.jsonl"
                if not args.dry_run
                else Path(f"{dataset_slug}_{variant_name}.jsonl")
            )
            debug_dir = (
                run_root / "debug" / dataset_slug / variant_name
                if not args.dry_run
                else Path(f"debug/{dataset_slug}/{variant_name}")
            )
            eval_dir = (
                run_root / f"{dataset_slug}_{variant_name}_eval"
                if not args.dry_run
                else Path(f"{dataset_slug}_{variant_name}_eval")
            )
            if not args.dry_run:
                debug_dir.mkdir(parents=True, exist_ok=True)
            gen_cmd = [PYTHON_BIN] + build_generate_args(
                dataset, gen_output, debug_dir, args, cfg
            )
            run_command(gen_cmd, dry_run=args.dry_run)

            exec_cmd = [
                PYTHON_BIN,
            ] + build_execute_args(gen_output, eval_dir, args)
            run_command(exec_cmd, dry_run=args.dry_run)

            if args.dry_run:
                continue

            report_path = eval_dir / "evaluation_report.json"
            if not report_path.exists():
                raise FileNotFoundError(
                    f"Missing evaluation report for {dataset} / {variant_name}: {report_path}"
                )
            with report_path.open("r", encoding="utf-8") as handle:
                report = json.load(handle)
            avg_attempts, _ = compute_attempt_stats(gen_output)
            records.append(
                {
                    "dataset": dataset,
                    "variant": variant_name,
                    "report": report,
                    "avg_attempts": avg_attempts,
                    "notes": variant.description,
                }
            )
        print("")

    if args.dry_run:
        print("Dry run completed. No commands were executed.")
        return

    summary_path = run_root / "ablation_summary.md"
    summarise_records(records, summary_path)


if __name__ == "__main__":
    main()
```
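For reference, a dry-run invocation of the suite (all flags are defined in `parse_args` above; the dataset list is illustrative):

```bash
python scripts/run_ablation_suite.py \
  --model gpt-4o \
  --datasets IndustryOR ComplexLP \
  --dry_run
```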
scripts/run_generate_and_evaluate.sh
ADDED
|
@@ -0,0 +1,640 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
set -uo pipefail
|
| 4 |
+
|
| 5 |
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 6 |
+
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
| 7 |
+
OPEN_ROOT="$(cd "${PROJECT_ROOT}/../.." && pwd)"
|
| 8 |
+
SRC_DIR="${PROJECT_ROOT}/src"
|
| 9 |
+
export PYTHONPATH="${SRC_DIR}:${PYTHONPATH:-}"
|
| 10 |
+
|
| 11 |
+
# Generate and Evaluate - Combined pipeline for generation + evaluation
|
| 12 |
+
# Usage: ./run_generate_and_evaluate.sh [model_name] [max_problems] [num_workers] [timeout] [tolerance] [dataset_name]
|
| 13 |
+
#
|
| 14 |
+
# Environment Variables:
|
| 15 |
+
# REFRESH_DEBUG_MEMORY - Set to "false" to disable auto-backup and clearing of debug memory (default: true)
|
| 16 |
+
# RUN_ALL_BENCHMARKS - Set to "true" to run all benchmarks in ./data/benchmarks/ (default: true)
|
| 17 |
+
# USE_HF_OFFLINE - Set to "false" to allow downloading models from Hugging Face (default: true)
|
| 18 |
+
# PARALLEL_BENCHMARKS - Set to "true" to run benchmarks in parallel (default: true)
|
| 19 |
+
# MAX_PARALLEL_JOBS - Maximum number of parallel jobs (default: 4)
|
| 20 |
+
# DATASET_NAME - Dataset to run when RUN_ALL_BENCHMARKS=false (default: IndustryOR)
|
| 21 |
+
# EMBEDDING_MODEL - Optional embedding model name or local path passed to memory retrieval
|
| 22 |
+
#
|
| 23 |
+
# Example:
|
| 24 |
+
# ./run_generate_and_evaluate.sh # Run with default settings (all benchmarks, offline mode, parallel)
|
| 25 |
+
# RUN_ALL_BENCHMARKS=false ./run_generate_and_evaluate.sh # Run single dataset
|
| 26 |
+
# RUN_ALL_BENCHMARKS=false ./run_generate_and_evaluate.sh gpt-4o 100 64 90 0.05 OPT-Principled
|
| 27 |
+
# USE_HF_OFFLINE=false ./run_generate_and_evaluate.sh # Allow downloading models
|
| 28 |
+
# REFRESH_DEBUG_MEMORY=false ./run_generate_and_evaluate.sh # Run without refreshing debug memory
|
| 29 |
+
# PARALLEL_BENCHMARKS=false ./run_generate_and_evaluate.sh # Run sequentially
|
| 30 |
+
# MAX_PARALLEL_JOBS=2 ./run_generate_and_evaluate.sh # Limit to 2 parallel jobs
|
| 31 |
+
|
| 32 |
+
MODEL=${1:-"gpt-4o"}
|
| 33 |
+
MAX_PROBLEMS=${2:-1000}
|
| 34 |
+
NUM_WORKERS=${3:-100}
|
| 35 |
+
TIMEOUT=${4:-60}
|
| 36 |
+
TOLERANCE=${5:-0.05}
|
| 37 |
+
|
| 38 |
+
# Configuration: Auto-backup and clear debug memory before running
|
| 39 |
+
# Set to "false" to disable this feature
|
| 40 |
+
REFRESH_DEBUG_MEMORY=${REFRESH_DEBUG_MEMORY:-true}
|
| 41 |
+
|
| 42 |
+
# Configuration: Run all benchmarks or single dataset
|
| 43 |
+
RUN_ALL_BENCHMARKS=${RUN_ALL_BENCHMARKS:-true}
|
| 44 |
+
|
| 45 |
+
# Configuration: Use offline mode for Hugging Face (avoid network calls)
|
| 46 |
+
# Set to "false" if you need to download models for the first time
|
| 47 |
+
USE_HF_OFFLINE=${USE_HF_OFFLINE:-true}
|
| 48 |
+
|
| 49 |
+
# Configuration: Run benchmarks in parallel
|
| 50 |
+
# Set to "true" to enable concurrent datasets (default: sequential datasets)
|
| 51 |
+
PARALLEL_BENCHMARKS=${PARALLEL_BENCHMARKS:-false}
|
| 52 |
+
|
| 53 |
+
# Configuration: Maximum number of parallel jobs
|
| 54 |
+
# Adjust based on your system resources
|
| 55 |
+
MAX_PARALLEL_JOBS=${MAX_PARALLEL_JOBS:-4}
|
| 56 |
+
|
| 57 |
+
# Default single dataset
|
| 58 |
+
DEFAULT_DATASET=${DATASET_NAME:-${6:-"IndustryOR"}}
|
| 59 |
+
# DEFAULT_DATASET="ComplexOR"
|
| 60 |
+
TEMPERATURE=${TEMPERATURE:-0.01}
|
| 61 |
+
MEMORY_DIR="${PROJECT_ROOT}/memory_storage"
|
| 62 |
+
MEMORY_TOP_K=${MEMORY_TOP_K:-3}
|
| 63 |
+
PARALLEL=${PARALLEL:-128}
|
| 64 |
+
MAIN_TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
|
| 65 |
+
OUTPUT_DIR="${OPEN_ROOT}/results/Agora-Opt/generate_and_evaluate"
|
| 66 |
+
MAX_RETRIES=${MAX_RETRIES:-5}
|
| 67 |
+
BENCHMARKS_DIR="${PROJECT_ROOT}/../../data/benchmarks"
|
| 68 |
+
EMBEDDING_MODEL=${EMBEDDING_MODEL:-}
|
| 69 |
+
|
| 70 |
+
GENERATE_CLI="${PROJECT_ROOT}/scripts/generate_with_memory.py"
|
| 71 |
+
EXECUTE_CLI="${PROJECT_ROOT}/scripts/execute.py"
|
| 72 |
+
|
| 73 |
+
if [ -d "${BENCHMARKS_DIR}" ]; then
|
| 74 |
+
BENCHMARKS_DIR="$(cd "${BENCHMARKS_DIR}" && pwd)"
|
| 75 |
+
elif [ -d "${PROJECT_ROOT}/clean_benchmarks" ]; then
|
| 76 |
+
BENCHMARKS_DIR="$(cd "${PROJECT_ROOT}/clean_benchmarks" && pwd)"
|
| 77 |
+
elif [ -d "${PROJECT_ROOT}/../clean_benchmarks" ]; then
|
| 78 |
+
BENCHMARKS_DIR="$(cd "${PROJECT_ROOT}/../clean_benchmarks" && pwd)"
|
| 79 |
+
fi
|
| 80 |
+
|
| 81 |
+
# Create output directory
|
| 82 |
+
mkdir -p "${OUTPUT_DIR}"
|
| 83 |
+
|
| 84 |
+
ensure_or_debate_env() {
|
| 85 |
+
if [ "${CONDA_DEFAULT_ENV:-}" = "or-debate" ] && command -v python >/dev/null 2>&1; then
|
| 86 |
+
return 0
|
| 87 |
+
fi
|
| 88 |
+
|
| 89 |
+
if ! command -v conda >/dev/null 2>&1; then
|
| 90 |
+
echo "❌ conda command not found. Please install Conda or activate the or-debate environment manually."
|
| 91 |
+
return 1
|
| 92 |
+
fi
|
| 93 |
+
|
| 94 |
+
local conda_bin
|
| 95 |
+
local conda_base
|
| 96 |
+
conda_bin="$(command -v conda)"
|
| 97 |
+
conda_base="$(cd "$(dirname "${conda_bin}")/.." && pwd)"
|
| 98 |
+
|
| 99 |
+
if [ -f "${conda_base}/etc/profile.d/conda.sh" ]; then
|
| 100 |
+
# shellcheck disable=SC1090
|
| 101 |
+
source "${conda_base}/etc/profile.d/conda.sh"
|
| 102 |
+
else
|
| 103 |
+
eval "$("${conda_bin}" shell.bash hook)"
|
| 104 |
+
fi
|
| 105 |
+
|
| 106 |
+
conda activate or-debate
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
# ============================================
|
| 110 |
+
# Function: Backup and Clear Debug Memory
|
| 111 |
+
# ============================================
|
| 112 |
+
backup_debug_memory() {
|
| 113 |
+
if [ "${REFRESH_DEBUG_MEMORY}" = "true" ]; then
|
| 114 |
+
DEBUG_MEMORY_FILE="${MEMORY_DIR}/debug_memory.jsonl"
|
| 115 |
+
BACKUP_DIR="${MEMORY_DIR}/backups/${MAIN_TIMESTAMP}"
|
| 116 |
+
|
| 117 |
+
if [ -f "${DEBUG_MEMORY_FILE}" ]; then
|
| 118 |
+
echo "================================================"
|
| 119 |
+
echo "🗂️ Backing up debug memory..."
|
| 120 |
+
echo "================================================"
|
| 121 |
+
|
| 122 |
+
# Create backup directory
|
| 123 |
+
mkdir -p ${BACKUP_DIR}
|
| 124 |
+
|
| 125 |
+
# Copy debug_memory.jsonl to backup
|
| 126 |
+
cp "${DEBUG_MEMORY_FILE}" "${BACKUP_DIR}/debug_memory.jsonl"
|
| 127 |
+
|
| 128 |
+
# Get file size and line count
|
| 129 |
+
FILE_SIZE=$(du -h "${DEBUG_MEMORY_FILE}" | cut -f1)
|
| 130 |
+
LINE_COUNT=$(wc -l < "${DEBUG_MEMORY_FILE}")
|
| 131 |
+
|
| 132 |
+
echo "✅ Backed up debug memory:"
|
| 133 |
+
echo " Location: ${BACKUP_DIR}/debug_memory.jsonl"
|
| 134 |
+
echo " Size: ${FILE_SIZE}"
|
| 135 |
+
echo " Lines: ${LINE_COUNT}"
|
| 136 |
+
|
| 137 |
+
# Clear the original file
|
| 138 |
+
> "${DEBUG_MEMORY_FILE}"
|
| 139 |
+
echo "✅ Cleared original debug memory file"
|
| 140 |
+
echo ""
|
| 141 |
+
else
|
| 142 |
+
echo "ℹ️ No debug memory file found, skipping backup"
|
| 143 |
+
echo ""
|
| 144 |
+
fi
|
| 145 |
+
else
|
| 146 |
+
echo "ℹ️ Debug memory refresh is disabled (REFRESH_DEBUG_MEMORY=false)"
|
| 147 |
+
echo ""
|
| 148 |
+
fi
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
normalize_dataset_name() {
|
| 152 |
+
local dataset_name="$1"
|
| 153 |
+
dataset_name="${dataset_name%.jsonl}"
|
| 154 |
+
case "${dataset_name}" in
|
| 155 |
+
ComplexLP_clean) echo "ComplexLP" ;;
|
| 156 |
+
EasyLP_clean) echo "EasyLP" ;;
|
| 157 |
+
IndustryOR_clean|IndustryOR_v2|IndustryOR_fixedV2|IndustryOR_fixedV2_clean) echo "IndustryOR" ;;
|
| 158 |
+
NL4Opt|NL4Opt_clean|NL4OPT_clean) echo "NL4OPT" ;;
|
| 159 |
+
NLP4LP_clean) echo "NLP4LP" ;;
|
| 160 |
+
ComplexOR_clean) echo "ComplexOR" ;;
|
| 161 |
+
ReSocratic_clean) echo "ReSocratic" ;;
|
| 162 |
+
combined|combined_dataset|OPT-Principled_clean) echo "OPT-Principled" ;;
|
| 163 |
+
*) echo "${dataset_name}" ;;
|
| 164 |
+
esac
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
DEFAULT_DATASET="$(normalize_dataset_name "${DEFAULT_DATASET}")"
|
| 168 |
+
|
| 169 |
+
# ============================================
|
| 170 |
+
# Function: Run single dataset (core logic)
|
| 171 |
+
# ============================================
|
| 172 |
+
process_dataset() {
|
| 173 |
+
local DATASET_NAME
|
| 174 |
+
DATASET_NAME="$(normalize_dataset_name "$1")"
|
| 175 |
+
local TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
|
| 176 |
+
local OUTPUT_FILE="${OUTPUT_DIR}/${MODEL}_${DATASET_NAME}_${TIMESTAMP}.jsonl"
|
| 177 |
+
local EVAL_FILE="${OUTPUT_DIR}/${MODEL}_${DATASET_NAME}_eval_${TIMESTAMP}.jsonl"
|
| 178 |
+
local EVAL_REPORT="${EVAL_FILE}/evaluation_report.json"
|
| 179 |
+
|
| 180 |
+
echo ""
|
| 181 |
+
echo "╔════════════════════════════════════════════════╗"
|
| 182 |
+
echo "║ Processing Dataset: ${DATASET_NAME}"
|
| 183 |
+
echo "╚════════════════════════════════════════════════╝"
|
| 184 |
+
echo ""
|
| 185 |
+
|
| 186 |
+
# ============================================
|
| 187 |
+
# STEP 1: Generation
|
| 188 |
+
# ============================================
|
| 189 |
+
echo "================================================"
|
| 190 |
+
echo "📝 STEP 1/2: Generating code with memory..."
|
| 191 |
+
echo "================================================"
|
| 192 |
+
echo "Dataset: ${DATASET_NAME}"
|
| 193 |
+
echo ""
|
| 194 |
+
|
| 195 |
+
local generate_args=(
|
| 196 |
+
--dataset "${DATASET_NAME}"
|
| 197 |
+
--model "${MODEL}"
|
| 198 |
+
--temperature "${TEMPERATURE}"
|
| 199 |
+
--max_problems "${MAX_PROBLEMS}"
|
| 200 |
+
--memory_dir "${MEMORY_DIR}"
|
| 201 |
+
--memory_top_k "${MEMORY_TOP_K}"
|
| 202 |
+
--parallel "${PARALLEL}"
|
| 203 |
+
--output "${OUTPUT_FILE}"
|
| 204 |
+
--max_retries "${MAX_RETRIES}"
|
| 205 |
+
        --execution_timeout 60
    )

    if [ -n "${EMBEDDING_MODEL}" ]; then
        generate_args+=(--embedding_model "${EMBEDDING_MODEL}")
    fi

    python "${GENERATE_CLI}" "${generate_args[@]}"

    EXIT_CODE=$?

    if [ ${EXIT_CODE} -ne 0 ]; then
        echo ""
        echo "❌ Generation failed for ${DATASET_NAME} with exit code ${EXIT_CODE}"
        return 1
    fi

    echo ""
    echo "✅ Generation completed for ${DATASET_NAME}!"
    echo ""

    # Show generation summary
    if [ -f "${OUTPUT_FILE}" ]; then
        TOTAL=$(wc -l < "${OUTPUT_FILE}")
        SUCCESS=$(grep -c '"status": "success"' "${OUTPUT_FILE}" 2>/dev/null || true)
        if [ -z "${SUCCESS}" ]; then
            SUCCESS=0
        fi
        echo "📊 Generation Summary:"
        echo "  Total problems: ${TOTAL}"
        echo "  Successful: ${SUCCESS}"

        if [ "${SUCCESS}" -eq 0 ]; then
            echo ""
            echo "❌ Generation produced zero successful solutions for ${DATASET_NAME}"
            echo "   Refusing to continue with an incomplete run."
            return 1
        fi
    fi

    echo ""

    # ============================================
    # STEP 2: Evaluation
    # ============================================
    echo "================================================"
    echo "🔍 STEP 2/2: Executing and evaluating..."
    echo "================================================"
    echo ""

    local execute_args=(
        --input_file "${OUTPUT_FILE}"
        --output_dir "${EVAL_FILE}"
        --num_workers "${NUM_WORKERS}"
        --timeout "${TIMEOUT}"
        --tolerance "${TOLERANCE}"
        --use_relative_tolerance
    )

    if [ -n "${EMBEDDING_MODEL}" ]; then
        execute_args+=(--embedding_model "${EMBEDDING_MODEL}")
    fi

    python "${EXECUTE_CLI}" "${execute_args[@]}"
    EXIT_CODE=$?

    if [ ${EXIT_CODE} -ne 0 ]; then
        echo ""
        echo "❌ Evaluation failed for ${DATASET_NAME} with exit code ${EXIT_CODE}"
        return 1
    fi

    echo ""
    echo "✅ Evaluation completed for ${DATASET_NAME}!"
    echo ""

    # Show evaluation report if exists
    if [ -f "${EVAL_REPORT}" ]; then
        echo "📊 Evaluation Results for ${DATASET_NAME}:"
        cat "${EVAL_REPORT}" | jq '{
            accuracy: .accuracy,
            correct: .correct,
            total: .total_problems,
            status_counts: .status_counts
        }' 2>/dev/null || cat "${EVAL_REPORT}"
        echo ""

        # Store results for final summary (with lock for parallel execution)
        ACCURACY=$(cat "${EVAL_REPORT}" | jq -r '.accuracy' 2>/dev/null || echo "N/A")
        CORRECT=$(cat "${EVAL_REPORT}" | jq -r '.correct' 2>/dev/null || echo "N/A")
        TOTAL_PROBS=$(cat "${EVAL_REPORT}" | jq -r '.total_problems' 2>/dev/null || echo "N/A")

        # Use lock to safely append to results file (fallback to simple append if flock not available)
        RESULTS_LOCK="${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.lock"
        if command -v flock >/dev/null 2>&1; then
            (
                flock -x 200
                echo "${DATASET_NAME}|${ACCURACY}|${CORRECT}|${TOTAL_PROBS}|${EVAL_FILE}" >> "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
            ) 200>"${RESULTS_LOCK}"
        else
            # Fallback: use simple append (may have race condition but unlikely with small writes)
            echo "${DATASET_NAME}|${ACCURACY}|${CORRECT}|${TOTAL_PROBS}|${EVAL_FILE}" >> "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
        fi
    fi

    echo "================================================"
    echo ""

    if [ -f "${EVAL_REPORT}" ]; then
        return 0
    else
        return 1
    fi
}
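
Note (editor's illustration, not part of the commit): the `( flock -x 200; echo ... ) 200>lockfile` subshell above serializes appends to the shared batch-results file when several dataset jobs run in parallel. A minimal Python sketch of the same exclusive-lock append pattern, assuming a POSIX system with the standard fcntl module; the function and path names are hypothetical:

import fcntl

def append_result_line(results_path: str, lock_path: str, line: str) -> None:
    # Hold an exclusive lock for the duration of the append, mirroring
    # the shell's `( flock -x 200; echo ... ) 200>lockfile` idiom.
    with open(lock_path, "w") as lock_file:
        fcntl.flock(lock_file, fcntl.LOCK_EX)
        try:
            with open(results_path, "a", encoding="utf-8") as fh:
                fh.write(line + "\n")
        finally:
            fcntl.flock(lock_file, fcntl.LOCK_UN)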

# ============================================
# Function: Run single dataset (internal, supports logging)
# ============================================
run_single_dataset_internal() {
    local DATASET_NAME=$1
    local LOG_FILE=$2
    local STREAM_OUTPUT=${3:-false}

    if [ "${STREAM_OUTPUT}" = "true" ]; then
        process_dataset "${DATASET_NAME}" |& tee "${LOG_FILE}"
        local EXIT_CODE=${PIPESTATUS[0]}
        return ${EXIT_CODE}
    else
        process_dataset "${DATASET_NAME}" > "${LOG_FILE}" 2>&1
        return $?
    fi
}

# ============================================
# Function: Run single dataset (wrapper for sequential execution)
# ============================================
run_single_dataset() {
    local DATASET_NAME=$1
    local STREAM_OUTPUT=${2:-false}
    local LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"

    run_single_dataset_internal "${DATASET_NAME}" "${LOG_FILE}" "${STREAM_OUTPUT}"
    local EXIT_CODE=$?

    # Display output only when we did not already stream it live
    if [ "${STREAM_OUTPUT}" != "true" ]; then
        cat "${LOG_FILE}"
    fi

    return ${EXIT_CODE}
}

# ============================================
# Main Execution
# ============================================

echo "================================================"
echo "🚀 Generate + Evaluate Pipeline"
echo "================================================"
echo "Model: ${MODEL}"
echo "Max problems: ${MAX_PROBLEMS}"
echo "Temperature: ${TEMPERATURE}"
echo "Memory dir: ${MEMORY_DIR}"
echo "Memory Top-K: ${MEMORY_TOP_K}"
if [ -n "${EMBEDDING_MODEL}" ]; then
    echo "Embedding: ${EMBEDDING_MODEL}"
else
    echo "Embedding: MemoryBank default"
fi
echo "Parallel: ${PARALLEL}"
echo "Refresh Memory: ${REFRESH_DEBUG_MEMORY}"
echo "Run All Benchmarks: ${RUN_ALL_BENCHMARKS}"
echo "HF Offline: ${USE_HF_OFFLINE}"
echo "Parallel Benchmarks: ${PARALLEL_BENCHMARKS}"
if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
    echo "Max Parallel Jobs: ${MAX_PARALLEL_JOBS}"
fi
echo ""
echo "Eval Workers: ${NUM_WORKERS}"
echo "Eval Timeout: ${TIMEOUT}s"
echo "Tolerance: ${TOLERANCE} (relative)"
echo ""
echo "Max retries: ${MAX_RETRIES}"
echo "================================================"
echo ""

# Activate environment
ensure_or_debate_env || exit 1

# Set Hugging Face offline mode if enabled
if [ "${USE_HF_OFFLINE}" = "true" ]; then
    echo "ℹ️ Hugging Face offline mode enabled (using local cache)"
    export HF_HUB_OFFLINE=1
    export TRANSFORMERS_OFFLINE=1
    export HF_DATASETS_OFFLINE=1
else
    echo "ℹ️ Hugging Face online mode (may download models if needed)"
fi
echo ""

# Backup and clear debug memory (only once at the beginning)
backup_debug_memory

# ============================================
# Run benchmarks
# ============================================
if [ "${RUN_ALL_BENCHMARKS}" = "true" ]; then
    if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
        echo "================================================"
        echo "🔄 Running ALL benchmarks in PARALLEL"
        echo "================================================"
    else
        echo "================================================"
        echo "🔄 Running ALL benchmarks SEQUENTIALLY"
        echo "================================================"
    fi
    echo ""

    # Define benchmark dataset names in specified order (without .jsonl extension)
    # Modify this array to change the execution order
    BENCHMARK_NAMES=(
        "NL4OPT"
        "EasyLP"
        "ComplexLP"
        "NLP4LP"
        "ComplexOR"
        "IndustryOR"
        "ReSocratic"
        "OPT-Principled"
    )

    # Count total benchmarks
    TOTAL_BENCHMARKS=${#BENCHMARK_NAMES[@]}
    FAILED=0
    SKIPPED=0

    echo "Total benchmarks to process: ${TOTAL_BENCHMARKS}"
    echo ""
    echo "Execution order:"
    for i in "${!BENCHMARK_NAMES[@]}"; do
        echo "  $((i+1)). ${BENCHMARK_NAMES[$i]}"
    done
    echo ""

    # Initialize batch results file
    echo "Dataset|Accuracy|Correct|Total|Output" > "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"

    # Create lock file for parallel execution
    RESULTS_LOCK="${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.lock"
    touch "${RESULTS_LOCK}"

    # Process benchmarks (parallel or sequential)
    if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
        # Parallel execution
        declare -a PIDS=()
        declare -a DATASET_NAMES=()
        CURRENT_JOBS=0

        for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
            BENCHMARK_FILE="${BENCHMARKS_DIR}/${DATASET_NAME}.jsonl"

            # Check if file exists
            if [ ! -f "${BENCHMARK_FILE}" ]; then
                echo "⚠️ File not found: ${BENCHMARK_FILE}"
                echo "   Skipping ${DATASET_NAME}..."
                SKIPPED=$((SKIPPED + 1))
                continue
            fi

            # Wait for available slot if max jobs reached
            while true; do
                # Count running jobs
                CURRENT_JOBS=0
                for PID in "${PIDS[@]}"; do
                    if kill -0 ${PID} 2>/dev/null; then
                        CURRENT_JOBS=$((CURRENT_JOBS + 1))
                    fi
                done

                # Break if we have available slots
                if [ ${CURRENT_JOBS} -lt ${MAX_PARALLEL_JOBS} ]; then
                    break
                fi

                # Wait a bit before checking again
                sleep 1
            done

            # Start job in background
            LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"
            echo "🚀 Starting ${DATASET_NAME} (log: ${LOG_FILE})"

            (
                run_single_dataset_internal "${DATASET_NAME}" "${LOG_FILE}"
                EXIT_CODE=$?
                if [ ${EXIT_CODE} -ne 0 ]; then
                    echo "[${DATASET_NAME}] ❌ Failed with exit code ${EXIT_CODE}" >> "${OUTPUT_DIR}/failures_${MAIN_TIMESTAMP}.txt"
                else
                    echo "[${DATASET_NAME}] ✅ Completed successfully" >> "${OUTPUT_DIR}/success_${MAIN_TIMESTAMP}.txt"
                fi
            ) &

            PID=$!
            PIDS+=(${PID})
            DATASET_NAMES+=("${DATASET_NAME}")
        done

        # Wait for all jobs to complete
        echo ""
        echo "⏳ Waiting for all jobs to complete..."
        echo ""

        for i in "${!PIDS[@]}"; do
            PID=${PIDS[$i]}
            DATASET_NAME=${DATASET_NAMES[$i]}
            wait ${PID}
            EXIT_CODE=$?
            if [ ${EXIT_CODE} -ne 0 ]; then
                FAILED=$((FAILED + 1))
                echo "⚠️ ${DATASET_NAME} failed with exit code ${EXIT_CODE}"
            fi
        done

        # Clean up lock file
        rm -f "${RESULTS_LOCK}"

        echo ""
        echo "================================================"
        echo "📋 Individual Job Logs:"
        echo "================================================"
        for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
            LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"
            if [ -f "${LOG_FILE}" ]; then
                echo ""
                echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
                echo " ${DATASET_NAME} - Log File: ${LOG_FILE}"
                echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
                tail -20 "${LOG_FILE}"
            fi
        done
        echo ""

    else
        # Sequential execution
        CURRENT=0
        for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
            CURRENT=$((CURRENT + 1))
            BENCHMARK_FILE="${BENCHMARKS_DIR}/${DATASET_NAME}.jsonl"

            echo ""
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
            echo " Progress: ${CURRENT}/${TOTAL_BENCHMARKS}"
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

            # Check if file exists
            if [ ! -f "${BENCHMARK_FILE}" ]; then
                echo "⚠️ File not found: ${BENCHMARK_FILE}"
                echo "   Skipping..."
                SKIPPED=$((SKIPPED + 1))
                continue
            fi

            run_single_dataset "${DATASET_NAME}" true

            if [ $? -ne 0 ]; then
                FAILED=$((FAILED + 1))
                echo "⚠️ Failed to process ${DATASET_NAME}, continuing..."
            fi

            echo ""
        done

        # Clean up lock file
        rm -f "${RESULTS_LOCK}"
    fi

    # ============================================
    # Final Summary for All Benchmarks
    # ============================================
    echo ""
    echo "================================================"
    echo "🎉 All Benchmarks Complete!"
    echo "================================================"
    echo ""
    echo "Summary:"
    echo "  Total benchmarks: ${TOTAL_BENCHMARKS}"
    echo "  Successful: $((TOTAL_BENCHMARKS - FAILED - SKIPPED))"
    echo "  Failed: ${FAILED}"
    echo "  Skipped: ${SKIPPED}"
    echo ""
    echo "📊 Detailed Results:"
    echo "================================================"

    # Display formatted results table
    if [ -f "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt" ]; then
        echo ""
        printf "%-35s | %-10s | %-10s | %-10s\n" "Dataset" "Accuracy" "Correct" "Total"
        echo "--------------------------------------------------------------------------------"
        tail -n +2 "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt" | while IFS='|' read -r dataset accuracy correct total output; do
            printf "%-35s | %-10s | %-10s | %-10s\n" "${dataset}" "${accuracy}" "${correct}" "${total}"
        done
        echo ""
        echo "📁 Full results saved to: ${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
    fi

    echo ""
    echo "================================================"

else
    # Run single dataset mode
    echo "================================================"
    echo "📝 Running single dataset: ${DEFAULT_DATASET}"
    echo "================================================"
    echo ""

    BENCHMARK_FILE="${BENCHMARKS_DIR}/${DEFAULT_DATASET}.jsonl"
    if [ ! -f "${BENCHMARK_FILE}" ]; then
        echo "❌ Dataset file not found: ${BENCHMARK_FILE}"
        exit 1
    fi

    run_single_dataset "${DEFAULT_DATASET}" true

    if [ $? -ne 0 ]; then
        echo ""
        echo "❌ Pipeline failed"
        exit 1
    fi

    echo ""
    echo "🎉 Pipeline Complete!"
fi

echo ""
echo "✨ All done! Check the results above."
echo ""
scripts/run_memory_debate.py
ADDED

@@ -0,0 +1,17 @@
#!/usr/bin/env python3
"""Wrapper for debate_memory.run_memory_debate."""

from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from debate_memory.run_memory_debate import main


if __name__ == "__main__":
    main()
scripts/test_self_healing_full.sh
ADDED

@@ -0,0 +1,92 @@
#!/bin/bash

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
SRC_DIR="${PROJECT_ROOT}/src"
export PYTHONPATH="${SRC_DIR}:${PYTHONPATH:-}"
GENERATE_CLI="${PROJECT_ROOT}/scripts/generate_with_memory.py"

# Test self-healing mechanism with a small sample
# This will test the full pipeline with just 3 problems

echo "================================================"
echo "🧪 Testing Self-Healing Mechanism"
echo "================================================"
echo ""

# Activate conda environment
source ~/miniconda3/etc/profile.d/conda.sh
conda activate or-debate

# Test parameters
MODEL="deepseek-chat"
DATASET="IndustryOR"
MAX_PROBLEMS=3
OUTPUT_DIR="${PROJECT_ROOT}/test_output"
OUTPUT_FILE="${OUTPUT_DIR}/test_self_healing_$(date +%Y%m%d_%H%M%S).jsonl"
MEMORY_DIR="${PROJECT_ROOT}/memory_storage"
MAX_RETRIES=3

mkdir -p "${OUTPUT_DIR}"

echo "Configuration:"
echo "  Model: ${MODEL}"
echo "  Dataset: ${DATASET}"
echo "  Max problems: ${MAX_PROBLEMS}"
echo "  Max retries: ${MAX_RETRIES}"
echo "  Output: ${OUTPUT_FILE}"
echo ""

# Run generation with self-healing
set +e
python "${GENERATE_CLI}" \
    --dataset "${DATASET}" \
    --model "${MODEL}" \
    --max_problems "${MAX_PROBLEMS}" \
    --output "${OUTPUT_FILE}" \
    --memory_dir "${MEMORY_DIR}" \
    --memory_top_k 3 \
    --parallel 1 \
    --max_retries "${MAX_RETRIES}" \
    --execution_timeout 60
EXIT_CODE=$?
set -e


if [ ${EXIT_CODE} -ne 0 ]; then
    echo ""
    echo "❌ Test failed with exit code ${EXIT_CODE}"
    exit 1
fi

echo ""
echo "================================================"
echo "📊 Test Results"
echo "================================================"

if [ -f "${OUTPUT_FILE}" ]; then
    TOTAL=$(wc -l < "${OUTPUT_FILE}")
    echo "Total problems processed: ${TOTAL}"

    # Count successes (grep -c already prints 0 on no match, so don't append a second 0)
    SUCCESS=$(grep -c '"execution_status": "success"' "${OUTPUT_FILE}" 2>/dev/null || true)
    echo "Successful executions: ${SUCCESS:-0}"

    # Count with retries
    RETRIED=$(grep -c '"total_attempts": [2-9]' "${OUTPUT_FILE}" 2>/dev/null || true)
    echo "Problems that used retry: ${RETRIED:-0}"

    # Show sample result
    echo ""
    echo "Sample result (problem 1):"
    head -1 "${OUTPUT_FILE}" | python -m json.tool | grep -E '"id"|"execution_status"|"total_attempts"|"self_healing_enabled"'

    echo ""
    echo "✅ Test completed successfully!"
    echo "Full results saved to: ${OUTPUT_FILE}"
else
    echo "❌ Output file not found: ${OUTPUT_FILE}"
    exit 1
fi
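
Note (editor's illustration, not part of the commit): beyond the grep-based summary above, the per-problem JSONL can be inspected directly; the field names (`id`, `execution_status`, `total_attempts`) are the ones the test script itself greps for. A sketch, assuming a hypothetical output path:

import json
from pathlib import Path

# Hypothetical path; substitute the test_self_healing_*.jsonl file produced above.
output_file = Path("test_output/test_self_healing_20250101_000000.jsonl")

with output_file.open(encoding="utf-8") as fh:
    for line in fh:
        rec = json.loads(line)
        # One record per problem; total_attempts > 1 means the retry path ran.
        print(rec.get("id"), rec.get("execution_status"), rec.get("total_attempts"))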
src/debate_memory/__init__.py
ADDED

@@ -0,0 +1,11 @@
"""Debate-with-memory v2 core package."""

from importlib import metadata

try:
    __version__ = metadata.version("debate-memory")
except metadata.PackageNotFoundError:  # pragma: no cover - local usage
    __version__ = "0.0.0"

__all__ = ["__version__"]
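
Note (editor's illustration, not part of the commit): the try/except above keeps the package importable from a source checkout without installation; in that case `__version__` falls back to "0.0.0". A quick check, assuming src/ is on PYTHONPATH:

import debate_memory

# Prints "0.0.0" from the fallback branch unless a distribution named
# "debate-memory" has actually been installed (e.g. via pip).
print(debate_memory.__version__)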
src/debate_memory/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (404 Bytes).

src/debate_memory/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (541 Bytes).

src/debate_memory/__pycache__/build_memory_from_eval_results.cpython-311.pyc
ADDED
Binary file (14.4 kB).

src/debate_memory/__pycache__/config.cpython-310.pyc
ADDED
Binary file (4.82 kB).

src/debate_memory/__pycache__/config.cpython-311.pyc
ADDED
Binary file (6.6 kB).

src/debate_memory/__pycache__/debate_memory_builder.cpython-311.pyc
ADDED
Binary file (23.1 kB).

src/debate_memory/__pycache__/debug_executor.cpython-310.pyc
ADDED
Binary file (3.7 kB).

src/debate_memory/__pycache__/debug_memory.cpython-310.pyc
ADDED
Binary file (5.19 kB).

src/debate_memory/__pycache__/debug_memory_builder.cpython-311.pyc
ADDED
Binary file (8.81 kB).

src/debate_memory/__pycache__/generate_with_memory.cpython-310.pyc
ADDED
Binary file (24 kB).

src/debate_memory/__pycache__/generate_with_memory.cpython-311.pyc
ADDED
Binary file (40.9 kB).

src/debate_memory/__pycache__/llm.cpython-310.pyc
ADDED
Binary file (3.16 kB).

src/debate_memory/__pycache__/llm.cpython-311.pyc
ADDED
Binary file (4.98 kB).

src/debate_memory/__pycache__/memory_bank.cpython-310.pyc
ADDED
Binary file (9.19 kB).

src/debate_memory/__pycache__/memory_bank.cpython-311.pyc
ADDED
Binary file (15.3 kB).

src/debate_memory/__pycache__/run_memory_debate.cpython-311.pyc
ADDED
Binary file (27.6 kB).
src/debate_memory/augment_memory_from_standalone_runs.py
ADDED

@@ -0,0 +1,974 @@
#!/usr/bin/env python3
"""Build non-destructive memory variants from standalone pipeline runs."""

from __future__ import annotations

import argparse
import glob
import hashlib
import json
import shutil
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple

from llama_index.core import Document

from .memory_bank import MemoryBank

SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent.parent
DEFAULT_BASE_ROOT = PROJECT_ROOT
DEFAULT_VARIANTS_ROOT = PROJECT_ROOT / "memory_variants"
DEFAULT_STANDALONE_ROOT = Path("/home/datagen/OR-Debate/standalone_pipeline/runs")

MAIN_MEMORY_DIRNAME = "memory_storage"
DEBUG_CASE_MEMORY_DIRNAME = "debug_case_memory"
DEBATE_MEMORY_DIRNAME = "debate_memory_storage"
DEBUG_MEMORY_FILENAME = "debug_memory.jsonl"

DEBUG_FAILURE_STATUSES = {
    "execution_error",
    "error",
    "timeout",
    "no_code",
    "not_executed",
    "success_no_objective",
    "execution_failed",
}

PROMPT_ARTIFACT_HEADERS = (
    "\n# Retrieved Historical Cases",
    "\n# Debate Memory Insights",
    "\n# Retrieved Debug Guidance",
)


@dataclass
class RunArtifacts:
    source_root: Path
    run_dir: Path
    dataset: str
    model_a: str
    model_b: str
    single_generated: Dict[str, Path]
    debate_results: Optional[Path]
    consensus_jsonl: Optional[Path]
    consensus_eval: Optional[Path]
    manifest_path: Optional[Path]

    @property
    def has_complete_debate(self) -> bool:
        return bool(
            self.debate_results
            and self.consensus_jsonl
            and self.consensus_eval
            and self.debate_results.exists()
            and self.consensus_jsonl.exists()
            and self.consensus_eval.exists()
        )


@dataclass
class ReferenceSolution:
    source: str
    model: str
    code: str
    objective_value: Optional[float]
    chosen_model: Optional[str]


def now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()


def now_stamp() -> str:
    return datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")


def load_jsonl(path: Path) -> List[Dict]:
    rows: List[Dict] = []
    if not path or not path.exists():
        return rows
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return rows


def append_jsonl(path: Path, rows: Iterable[Dict]) -> int:
    count = 0
    with path.open("a", encoding="utf-8") as fh:
        for row in rows:
            fh.write(json.dumps(row, ensure_ascii=False) + "\n")
            count += 1
    return count


def load_json(path: Path) -> Dict:
    if not path.exists():
        return {}
    with path.open("r", encoding="utf-8") as fh:
        return json.load(fh)


def dump_json(path: Path, payload: Dict) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2, sort_keys=True)


def count_jsonl_lines(path: Path) -> int:
    if not path.exists():
        return 0
    with path.open("r", encoding="utf-8") as fh:
        return sum(1 for line in fh if line.strip())


def float_or_none(value) -> Optional[float]:
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def infer_models_from_run_name(run_name: str) -> Tuple[str, str]:
    parts = run_name.split("_vs_")
    if len(parts) != 2:
        return "modelA", "modelB"
    left = parts[0].split("_")
    if len(left) < 2:
        return left[-1], parts[1]
    return "_".join(left[1:]), parts[1]

def clean_description(text: str) -> str:
    cleaned = (text or "").strip()
    for header in PROMPT_ARTIFACT_HEADERS:
        pos = cleaned.find(header)
        if pos != -1:
            cleaned = cleaned[:pos].rstrip()
    return cleaned


def check_correctness(
    pred_obj: Optional[float],
    gt_obj: Optional[float],
    tolerance: float,
    use_relative_tolerance: bool,
) -> bool:
    if pred_obj is None or gt_obj is None:
        return False
    if gt_obj == 0:
        return abs(pred_obj) <= tolerance
    if use_relative_tolerance:
        return abs((pred_obj - gt_obj) / gt_obj) <= tolerance
    return abs(pred_obj - gt_obj) <= tolerance
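
Note (editor's illustration, not part of the commit): with relative tolerance enabled (as the evaluation step above does via --use_relative_tolerance), a prediction of 102.0 against a ground truth of 100.0 gives |102 - 100| / 100 = 0.02, which passes a 0.05 tolerance:

# Relative: |102 - 100| / 100 = 0.02 <= 0.05 -> True
print(check_correctness(102.0, 100.0, tolerance=0.05, use_relative_tolerance=True))

# Absolute: |102 - 100| = 2.0 > 0.05 -> False
print(check_correctness(102.0, 100.0, tolerance=0.05, use_relative_tolerance=False))

# Zero ground truth is special-cased with an absolute check on the prediction
print(check_correctness(0.01, 0.0, tolerance=0.05, use_relative_tolerance=True))  # True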

def sha1_short(text: str, length: int = 16) -> str:
    return hashlib.sha1(text.encode("utf-8")).hexdigest()[:length]


def build_doc(problem_id: int, description: str, solution_code: str, objective_value: float, metadata: Dict) -> Document:
    doc_text = f"""Problem: {description}

Solution approach:
{solution_code[:500]}...

Key features:
- Problem ID: {problem_id}
- Objective value: {objective_value}
- Status: Correct
"""
    return Document(
        text=doc_text,
        metadata={
            "problem_id": problem_id,
            "objective_value": objective_value,
            **metadata,
        },
    )


class BatchMemoryAppender:
    def __init__(self, memory_dir: Path, embedding_model: str) -> None:
        self.memory_dir = memory_dir
        self.bank = MemoryBank(memory_dir=str(memory_dir), embedding_model=embedding_model)
        self.pending_cases: List[Dict] = []
        self.pending_docs: List[Document] = []

    def add_case(
        self,
        *,
        problem_id: int,
        problem_desc: str,
        solution_code: str,
        objective_value: float,
        metadata: Dict,
    ) -> None:
        case = {
            "problem_id": int(problem_id),
            "description": problem_desc,
            "solution_code": solution_code,
            "objective_value": objective_value,
            "is_correct": True,
            "metadata": metadata,
        }
        self.pending_cases.append(case)
        self.pending_docs.append(
            build_doc(
                problem_id=int(problem_id),
                description=problem_desc,
                solution_code=solution_code,
                objective_value=objective_value,
                metadata=metadata,
            )
        )

    def finalize(self) -> int:
        if not self.pending_cases:
            return 0
        with Path(self.bank.cases_file).open("a", encoding="utf-8") as fh:
            for case in self.pending_cases:
                fh.write(json.dumps(case, ensure_ascii=False) + "\n")
        for doc in self.pending_docs:
            self.bank.index.insert(doc)
        self.bank.index.storage_context.persist(persist_dir=self.bank.index_dir)
        added = len(self.pending_cases)
        self.pending_cases.clear()
        self.pending_docs.clear()
        return added
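
Note (editor's illustration, not part of the commit): BatchMemoryAppender buffers both the cases.jsonl rows and the vector-index documents, then flushes them in a single finalize() call, so the relatively expensive index persist is amortized over all pending cases. A hedged usage sketch, assuming a hypothetical variant directory and the CLI's default embedding model:

# Hypothetical paths and content; a real run supplies these from parsed artifacts.
appender = BatchMemoryAppender(Path("memory_variants/demo/memory_storage"),
                               "BAAI/bge-small-en-v1.5")
appender.add_case(
    problem_id=1,
    problem_desc="Hypothetical LP problem",
    solution_code="print('OBJECTIVE_VALUE', 42.0)",
    objective_value=42.0,
    metadata={"source": "example"},
)
added = appender.finalize()  # appends to cases.jsonl and persists the index once
print(f"added {added} case(s)")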

def resolve_source_roots(patterns: Sequence[str]) -> List[Path]:
    resolved: List[Path] = []
    for pattern in patterns:
        matches = glob.glob(pattern)
        if matches:
            for match in matches:
                path = Path(match)
                if path.is_dir():
                    resolved.append(path.resolve())
        else:
            path = Path(pattern)
            if path.is_dir():
                resolved.append(path.resolve())
    deduped = sorted({path for path in resolved})
    return deduped


def resolve_file(run_dir: Path, raw_value: Optional[str]) -> Optional[Path]:
    if not raw_value:
        return None
    candidate = Path(raw_value)
    if not candidate.is_absolute():
        candidate = run_dir / candidate
    return candidate if candidate.exists() else None


def discover_run_artifacts(source_root: Path) -> List[RunArtifacts]:
    runs: List[RunArtifacts] = []
    if not source_root.exists():
        return runs

    for run_dir in sorted(source_root.iterdir()):
        if not run_dir.is_dir():
            continue

        manifest_path = run_dir / "run_manifest.json"
        manifest = load_json(manifest_path) if manifest_path.exists() else {}

        model_a, model_b = infer_models_from_run_name(run_dir.name)
        model_a = manifest.get("model_a", model_a)
        model_b = manifest.get("model_b", model_b)
        dataset = manifest.get("dataset", source_root.name)

        single_generated: Dict[str, Path] = {}
        for generated in sorted(run_dir.glob("single/*/generated.jsonl")):
            model_name = generated.parent.name
            single_generated[model_name] = generated

        model_a_generated = resolve_file(run_dir, manifest.get("model_a_generated"))
        model_b_generated = resolve_file(run_dir, manifest.get("model_b_generated"))
        if model_a_generated:
            single_generated.setdefault(model_a, model_a_generated)
        if model_b_generated:
            single_generated.setdefault(model_b, model_b_generated)

        debate_results = run_dir / "debate" / "debate_results.jsonl"
        if not debate_results.exists():
            debate_results = resolve_file(run_dir, manifest.get("debate_dir"))
            if debate_results and debate_results.is_dir():
                debate_results = debate_results / "debate_results.jsonl"
            if debate_results and not debate_results.exists():
                debate_results = None

        consensus_jsonl = resolve_file(run_dir, manifest.get("consensus_jsonl"))
        if consensus_jsonl is None:
            candidates = sorted((run_dir / "debate").glob("consensus_*.jsonl"))
            consensus_jsonl = candidates[0] if candidates else None

        consensus_eval = run_dir / "consensus_eval" / "evaluation_results.jsonl"
        if not consensus_eval.exists():
            consensus_eval = None

        runs.append(
            RunArtifacts(
                source_root=source_root,
                run_dir=run_dir,
                dataset=dataset,
                model_a=model_a,
                model_b=model_b,
                single_generated=single_generated,
                debate_results=debate_results,
                consensus_jsonl=consensus_jsonl,
                consensus_eval=consensus_eval,
                manifest_path=manifest_path if manifest_path.exists() else None,
            )
        )
    return runs


def load_existing_case_signatures(cases_file: Path) -> set[str]:
    signatures: set[str] = set()
    if not cases_file.exists():
        return signatures
    with cases_file.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            meta = row.get("metadata") or {}
            for key in ("import_signature", "debate_signature"):
                value = meta.get(key)
                if value:
                    signatures.add(str(value))
    return signatures


def load_existing_debug_signatures(debug_memory_file: Path) -> set[str]:
    signatures: set[str] = set()
    if not debug_memory_file.exists():
        return signatures
    with debug_memory_file.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            signature = row.get("signature")
            if signature:
                signatures.add(str(signature))
    return signatures


def summarize_rounds(rounds: List[Dict], max_chars: int = 1800) -> str:
    if not rounds:
        return ""
    lines: List[str] = []
    for rnd in rounds:
        lines.append(
            f"Round {rnd.get('round')}: "
            f"A={rnd.get('result_A')} ({rnd.get('status_A')}), "
            f"B={rnd.get('result_B')} ({rnd.get('status_B')})"
        )
        analysis_a = (rnd.get("analysis_A") or "").strip()
        analysis_b = (rnd.get("analysis_B") or "").strip()
        if analysis_a:
            lines.append(f"Model A analysis:\n{analysis_a}")
        if analysis_b:
            lines.append(f"Model B analysis:\n{analysis_b}")
        lines.append("")
    text = "\n".join(lines).strip()
    if len(text) <= max_chars:
        return text
    return text[: max_chars - 16] + "\n...\n(truncated)"


def heuristic_debate_summary(entry: Dict, model_a: str, model_b: str) -> Dict:
    initial_a = entry.get("initial_A_result")
    initial_b = entry.get("initial_B_result")
    final_result = entry.get("final_result")
    chosen_model = entry.get("chosen_model") or "consensus"
    rounds = entry.get("debate_rounds") or []
    summary = (
        f"Initial mismatch: {model_a}={initial_a}, {model_b}={initial_b}. "
        f"Debate converged in {len(rounds)} rounds and selected {chosen_model} "
        f"with final objective {final_result}."
    )
    decisive_argument = (
        f"The final candidate from {chosen_model} was retained after both sides "
        "aligned on the same executable outcome."
    )
    guardrails = [
        "Compare feasibility and objective values before rewriting the model.",
        "Keep a stable executable candidate whenever later edits do not improve the result.",
    ]
    return {
        "summary": summary,
        "mismatch_reason": "The two models initially disagreed on the objective value or feasibility.",
        "decisive_argument": decisive_argument,
        "guardrails": guardrails,
        "modeling_patterns": [],
        "history_excerpt": summarize_rounds(rounds),
    }
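
Note (editor's illustration, not part of the commit): heuristic_debate_summary builds a templated summary directly from the debate record's fields, without calling an LLM. For a minimal hypothetical debate entry:

entry = {
    "initial_A_result": 120.0,
    "initial_B_result": 95.0,
    "final_result": 95.0,
    "chosen_model": "modelB",
    "debate_rounds": [{"round": 1, "result_A": 95.0, "status_A": "success",
                       "result_B": 95.0, "status_B": "success"}],
}
payload = heuristic_debate_summary(entry, "modelA", "modelB")
print(payload["summary"])
# Initial mismatch: modelA=120.0, modelB=95.0. Debate converged in 1 rounds
# and selected modelB with final objective 95.0.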

def guidance_for_status(status: str) -> str:
    status = (status or "").strip()
    if status == "no_code":
        return "Return a complete executable Python program inside a ```python``` block."
    if status == "success_no_objective":
        return "Print the optimized objective explicitly, for example with OBJECTIVE_VALUE after optimize()."
    if status == "timeout":
        return "Reduce model-construction overhead and check whether loops or constraints are exploding combinatorially."
    if status == "not_executed":
        return "Make sure the generated response contains runnable code and that the execution step is actually triggered."
    return "Check imports, indexing, variable names, and model-object references against the traceback."


def has_disagreement(initial_a: Optional[float], initial_b: Optional[float], tolerance: float) -> bool:
    if initial_a is None or initial_b is None:
        return True
    return abs(initial_a - initial_b) > tolerance


def choose_error_text(row: Dict) -> str:
    stderr = (row.get("execution_stderr") or "").strip()
    stdout = (row.get("execution_stdout") or "").strip()
    status = (row.get("execution_status") or row.get("status") or "").strip()
    if stderr:
        return stderr
    if stdout:
        return stdout
    if status == "no_code":
        return "Generated code block is empty."
    if status == "not_executed":
        return "Execution did not complete and no detailed stderr/stdout was recorded."
    if status == "success_no_objective":
        return "Execution succeeded but no objective value could be extracted from stdout."
    return status or "Unknown execution issue."


def clone_base_memory_dirs(base_root: Path, variant_dir: Path) -> Dict[str, Path]:
    mapping = {}
    for dirname in (MAIN_MEMORY_DIRNAME, DEBUG_CASE_MEMORY_DIRNAME, DEBATE_MEMORY_DIRNAME):
        src = base_root / dirname
        dst = variant_dir / dirname
        shutil.copytree(src, dst)
        mapping[dirname] = dst
    return mapping


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Create augmented memory-bank variants from standalone pipeline runs without touching originals."
    )
    parser.add_argument(
        "--variant_name",
        type=str,
        required=True,
        help="Name of the output variant directory under memory_variants/",
    )
    parser.add_argument(
        "--source",
        nargs="+",
        required=True,
        help="Source directories or glob patterns under standalone_pipeline/runs.",
    )
    parser.add_argument(
        "--base_root",
        type=str,
        default=str(DEFAULT_BASE_ROOT),
        help="Project root that contains memory_storage/debug_case_memory/debate_memory_storage.",
    )
    parser.add_argument(
        "--variants_root",
        type=str,
        default=str(DEFAULT_VARIANTS_ROOT),
        help="Directory under which new variants are created.",
    )
    parser.add_argument(
        "--embedding_model",
        type=str,
        default="BAAI/bge-small-en-v1.5",
        help="Embedding model name or local path used when updating vector indexes.",
    )
    parser.add_argument(
        "--tolerance",
        type=float,
        default=0.05,
        help="Correctness tolerance for imported single-model cases.",
    )
    parser.add_argument(
        "--mismatch_tolerance",
        type=float,
        default=1e-3,
        help="Minimum difference between initial debate results to count as a disagreement.",
    )
    parser.add_argument(
        "--use_relative_tolerance",
        action="store_true",
        help="Use relative tolerance when judging single-model correctness.",
    )
    args = parser.parse_args()

    base_root = Path(args.base_root).resolve()
    variants_root = Path(args.variants_root).resolve()
    source_roots = resolve_source_roots(args.source)
    if not source_roots:
        raise FileNotFoundError(f"No source roots matched: {args.source}")

    variant_dir = variants_root / args.variant_name
    if variant_dir.exists():
        raise FileExistsError(f"Variant already exists: {variant_dir}")
    variant_dir.parent.mkdir(parents=True, exist_ok=True)

    print("=== Augment Standalone Memory Banks ===")
    print(f"Base root: {base_root}")
    print(f"Variant dir: {variant_dir}")
    print(f"Source roots: {len(source_roots)}")
    for root in source_roots:
        print(f"  - {root}")

    memory_dirs = clone_base_memory_dirs(base_root, variant_dir)

    main_memory_dir = memory_dirs[MAIN_MEMORY_DIRNAME]
    debug_case_memory_dir = memory_dirs[DEBUG_CASE_MEMORY_DIRNAME]
    debate_memory_dir = memory_dirs[DEBATE_MEMORY_DIRNAME]
    debug_memory_file = main_memory_dir / DEBUG_MEMORY_FILENAME

    main_seen = load_existing_case_signatures(main_memory_dir / "cases.jsonl")
    debug_case_seen = load_existing_case_signatures(debug_case_memory_dir / "cases.jsonl")
    debate_seen = load_existing_case_signatures(debate_memory_dir / "cases.jsonl")
    debug_raw_seen = load_existing_debug_signatures(debug_memory_file)

    main_appender = BatchMemoryAppender(main_memory_dir, args.embedding_model)
    debug_case_appender = BatchMemoryAppender(debug_case_memory_dir, args.embedding_model)
    debate_appender = BatchMemoryAppender(debate_memory_dir, args.embedding_model)
    pending_debug_rows: List[Dict] = []

    stats = {
        "runs": {
            "source_roots": len(source_roots),
            "runs_discovered": 0,
            "runs_with_manifest": 0,
            "runs_with_complete_debate": 0,
            "runs_partial_or_single_only": 0,
        },
        "memory_storage": {
            "single_correct_added": 0,
            "consensus_correct_added": 0,
            "duplicates_skipped": 0,
            "incorrect_or_missing_single_skipped": 0,
            "consensus_missing_code_or_eval_skipped": 0,
        },
        "debug_memory": {
            "raw_records_added": 0,
            "case_records_added": 0,
            "duplicates_skipped": 0,
            "non_failure_skipped": 0,
            "missing_reference_skipped": 0,
        },
        "debate_memory": {
            "added": 0,
            "duplicates_skipped": 0,
            "missing_or_incorrect_skipped": 0,
        },
    }

    all_runs: List[RunArtifacts] = []
    for source_root in source_roots:
        all_runs.extend(discover_run_artifacts(source_root))

    stats["runs"]["runs_discovered"] = len(all_runs)
    stats["runs"]["runs_with_manifest"] = sum(1 for run in all_runs if run.manifest_path)
    stats["runs"]["runs_with_complete_debate"] = sum(1 for run in all_runs if run.has_complete_debate)
    stats["runs"]["runs_partial_or_single_only"] = stats["runs"]["runs_discovered"] - stats["runs"]["runs_with_complete_debate"]

    for run in all_runs:
        print(f"Processing run: {run.run_dir}")

        single_rows_by_model: Dict[str, Dict[int, Dict]] = {}
        correct_single_refs: Dict[int, Dict[str, ReferenceSolution]] = {}

        for model_name, generated_path in sorted(run.single_generated.items()):
            rows_map: Dict[int, Dict] = {}
            for row in load_jsonl(generated_path):
                problem_id = row.get("id")
                if problem_id is None:
                    continue
                try:
                    problem_id = int(problem_id)
                except (TypeError, ValueError):
                    continue
                rows_map[problem_id] = row

                code = (row.get("generated_code") or "").strip()
                pred = float_or_none(row.get("execution_objective_value"))
                gt = float_or_none(row.get("answer"))
                is_correct = bool(code) and check_correctness(
                    pred,
                    gt,
                    tolerance=args.tolerance,
                    use_relative_tolerance=args.use_relative_tolerance,
                )
                if not is_correct:
                    stats["memory_storage"]["incorrect_or_missing_single_skipped"] += 1
                    continue

                description = clean_description(row.get("description", ""))
                signature_basis = (
                    f"main|single|{run.dataset}|{problem_id}|{model_name}|"
                    f"{sha1_short(code, 20)}|{pred}"
                )
                import_signature = f"standalone-main:{sha1_short(signature_basis, 20)}"
                if import_signature in main_seen:
                    stats["memory_storage"]["duplicates_skipped"] += 1
                    continue

                metadata = {
                    "source": "standalone_single_generated",
                    "dataset": run.dataset,
                    "run_dir": str(run.run_dir),
                    "run_name": run.run_dir.name,
                    "source_root": str(run.source_root),
                    "model": model_name,
                    "execution_status": row.get("execution_status", "unknown"),
                    "ground_truth": row.get("answer"),
                    "case_kind": "single",
                    "import_signature": import_signature,
                }
                main_appender.add_case(
                    problem_id=problem_id,
                    problem_desc=description,
                    solution_code=code,
                    objective_value=pred if pred is not None else 0.0,
                    metadata=metadata,
                )
                main_seen.add(import_signature)
                stats["memory_storage"]["single_correct_added"] += 1
                correct_single_refs.setdefault(problem_id, {})[model_name] = ReferenceSolution(
                    source="single",
                    model=model_name,
                    code=code,
                    objective_value=pred,
                    chosen_model=model_name,
                )

            single_rows_by_model[model_name] = rows_map

        consensus_rows_by_id: Dict[int, Dict] = {}
        debate_rows_by_id: Dict[int, Dict] = {}
        eval_rows_by_id: Dict[int, Dict] = {}
        consensus_refs: Dict[int, ReferenceSolution] = {}

        if run.has_complete_debate:
            for row in load_jsonl(run.consensus_jsonl):
                problem_id = row.get("id")
                if problem_id is None:
                    continue
                try:
                    consensus_rows_by_id[int(problem_id)] = row
                except (TypeError, ValueError):
                    continue
            for row in load_jsonl(run.debate_results):
                problem_id = row.get("problem_id")
                if problem_id is None:
                    continue
                try:
                    debate_rows_by_id[int(problem_id)] = row
                except (TypeError, ValueError):
                    continue
            for row in load_jsonl(run.consensus_eval):
                problem_id = row.get("id")
                if problem_id is None:
                    continue
                try:
                    eval_rows_by_id[int(problem_id)] = row
                except (TypeError, ValueError):
                    continue

        for problem_id, eval_row in eval_rows_by_id.items():
            if not eval_row.get("is_correct", False):
                stats["memory_storage"]["consensus_missing_code_or_eval_skipped"] += 1
                continue

            consensus_row = consensus_rows_by_id.get(problem_id, {})
            debate_row = debate_rows_by_id.get(problem_id, {})
            code = (consensus_row.get("generated_code") or debate_row.get("final_code") or "").strip()
            if not code:
                stats["memory_storage"]["consensus_missing_code_or_eval_skipped"] += 1
                continue

            description = clean_description(
                consensus_row.get("description")
                or next(
                    (
                        model_rows[problem_id].get("description")
                        for model_rows in single_rows_by_model.values()
                        if problem_id in model_rows
                    ),
                    f"{run.dataset} problem {problem_id}",
                )
            )
            pred = float_or_none(eval_row.get("predicted_objective"))
            signature_basis = (
                f"main|consensus|{run.dataset}|{problem_id}|"
                f"{sha1_short(code, 20)}|{pred}"
            )
            import_signature = f"standalone-main:{sha1_short(signature_basis, 20)}"
            if import_signature in main_seen:
                stats["memory_storage"]["duplicates_skipped"] += 1
            else:
                metadata = {
                    "source": "standalone_consensus_eval",
                    "dataset": run.dataset,
                    "run_dir": str(run.run_dir),
                    "run_name": run.run_dir.name,
                    "source_root": str(run.source_root),
                    "modelA": run.model_a,
                    "modelB": run.model_b,
                    "chosen_model": debate_row.get("chosen_model") or consensus_row.get("chosen_model"),
                    "execution_status": eval_row.get("execution_status", "unknown"),
                    "ground_truth": eval_row.get("ground_truth"),
                    "case_kind": "consensus",
                    "import_signature": import_signature,
                }
                main_appender.add_case(
                    problem_id=problem_id,
                    problem_desc=description,
                    solution_code=code,
                    objective_value=pred if pred is not None else 0.0,
                    metadata=metadata,
                )
                main_seen.add(import_signature)
                stats["memory_storage"]["consensus_correct_added"] += 1

            consensus_refs[problem_id] = ReferenceSolution(
                source="consensus",
                model="debate_consensus",
                code=code,
                objective_value=pred,
                chosen_model=debate_row.get("chosen_model") or consensus_row.get("chosen_model"),
            )

        for problem_id, debate_row in debate_rows_by_id.items():
            eval_row = eval_rows_by_id.get(problem_id)
            if not eval_row or not eval_row.get("is_correct", False):
                stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
                continue
            if not debate_row.get("converged"):
                stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
                continue
            initial_a = float_or_none(debate_row.get("initial_A_result"))
            initial_b = float_or_none(debate_row.get("initial_B_result"))
            if not has_disagreement(initial_a, initial_b, args.mismatch_tolerance):
                stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
                continue

            final_code = (debate_row.get("final_code") or "").strip()
            if not final_code:
                stats["debate_memory"]["missing_or_incorrect_skipped"] += 1
                continue

            base_desc = clean_description(
                consensus_rows_by_id.get(problem_id, {}).get("description")
                or next(
                    (
                        model_rows[problem_id].get("description")
                        for model_rows in single_rows_by_model.values()
                        if problem_id in model_rows
                    ),
                    f"{run.dataset} problem {problem_id}",
                )
            )
            summary_payload = heuristic_debate_summary(debate_row, run.model_a, run.model_b)
            full_desc = (
                f"{base_desc}\n\n# Debate Memory Summary\n"
                f"{summary_payload.get('summary', '').strip()}"
            ).strip()
            debate_signature = (
                f"standalone-debate:{run.dataset}:{problem_id}:{sha1_short(final_code, 20)}"
            )
            if debate_signature in debate_seen:
                stats["debate_memory"]["duplicates_skipped"] += 1
                continue

            metadata = {
                "source": "standalone_debate_memory_import",
                "dataset": run.dataset,
                "run_dir": str(run.run_dir),
                "run_name": run.run_dir.name,
                "source_root": str(run.source_root),
                "modelA": run.model_a,
                "modelB": run.model_b,
                "initial_A_result": initial_a,
                "initial_B_result": initial_b,
                "ground_truth": eval_row.get("ground_truth"),
                "debate_signature": debate_signature,
                "import_signature": debate_signature,
                "summary": summary_payload,
            }
            debate_appender.add_case(
                problem_id=problem_id,
                problem_desc=full_desc,
                solution_code=final_code,
                objective_value=float_or_none(debate_row.get("final_result")) or 0.0,
                metadata=metadata,
            )
            debate_seen.add(debate_signature)
            stats["debate_memory"]["added"] += 1

        for model_name, rows_map in sorted(single_rows_by_model.items()):
            for problem_id, row in rows_map.items():
                status = row.get("execution_status") or row.get("status") or ""
                if status not in DEBUG_FAILURE_STATUSES:
                    stats["debug_memory"]["non_failure_skipped"] += 1
                    continue

                reference: Optional[ReferenceSolution] = None
                for other_model, ref in sorted(correct_single_refs.get(problem_id, {}).items()):
                    if other_model != model_name:
                        reference = ref
                        break
                if reference is None:
                    reference = consensus_refs.get(problem_id)
                if reference is None:
                    stats["debug_memory"]["missing_reference_skipped"]
|
| 856 |
+
continue
|
| 857 |
+
|
| 858 |
+
description = clean_description(row.get("description", ""))
|
| 859 |
+
error_text = choose_error_text(row)
|
| 860 |
+
guidance = (
|
| 861 |
+
f"{guidance_for_status(status)} "
|
| 862 |
+
f"Reference fix source: {reference.source} ({reference.model}); "
|
| 863 |
+
f"target objective: {reference.objective_value}."
|
| 864 |
+
)
|
| 865 |
+
import_signature = (
|
| 866 |
+
f"standalone-debug:{sha1_short(f'{run.dataset}|{problem_id}|{model_name}|{status}|{error_text}|{sha1_short(reference.code, 16)}', 20)}"
|
| 867 |
+
)
|
| 868 |
+
if import_signature in debug_case_seen or import_signature in debug_raw_seen:
|
| 869 |
+
stats["debug_memory"]["duplicates_skipped"] += 1
|
| 870 |
+
continue
|
| 871 |
+
|
| 872 |
+
debug_record = {
|
| 873 |
+
"signature": import_signature,
|
| 874 |
+
"status": status,
|
| 875 |
+
"error_text": error_text,
|
| 876 |
+
"guidance": guidance,
|
| 877 |
+
"problem_id": problem_id,
|
| 878 |
+
"description": description,
|
| 879 |
+
"metadata": {
|
| 880 |
+
"source": "standalone_runs.synthetic_debug_case",
|
| 881 |
+
"dataset": run.dataset,
|
| 882 |
+
"run_dir": str(run.run_dir),
|
| 883 |
+
"run_name": run.run_dir.name,
|
| 884 |
+
"source_root": str(run.source_root),
|
| 885 |
+
"model": model_name,
|
| 886 |
+
"reference_source": reference.source,
|
| 887 |
+
"reference_model": reference.model,
|
| 888 |
+
"reference_objective": reference.objective_value,
|
| 889 |
+
"reference_chosen_model": reference.chosen_model,
|
| 890 |
+
},
|
| 891 |
+
"timestamp": now_iso(),
|
| 892 |
+
}
|
| 893 |
+
pending_debug_rows.append(debug_record)
|
| 894 |
+
debug_raw_seen.add(import_signature)
|
| 895 |
+
|
| 896 |
+
prompt_desc = (
|
| 897 |
+
f"{description}\n\n"
|
| 898 |
+
f"## Error Details\n```\n{error_text}\n```\n"
|
| 899 |
+
f"## Guidance\n{guidance}\n"
|
| 900 |
+
)
|
| 901 |
+
reference_code = reference.code.strip()
|
| 902 |
+
solution_code = (
|
| 903 |
+
"# Synthetic Debug Memory Case\n"
|
| 904 |
+
f"# Signature: {import_signature}\n"
|
| 905 |
+
f"# Status: {status}\n"
|
| 906 |
+
f"# Reference source: {reference.source} ({reference.model})\n\n"
|
| 907 |
+
f"{reference_code}"
|
| 908 |
+
)
|
| 909 |
+
metadata = {
|
| 910 |
+
"source": "standalone_runs.synthetic_debug_case",
|
| 911 |
+
"dataset": run.dataset,
|
| 912 |
+
"run_dir": str(run.run_dir),
|
| 913 |
+
"run_name": run.run_dir.name,
|
| 914 |
+
"source_root": str(run.source_root),
|
| 915 |
+
"model": model_name,
|
| 916 |
+
"status": status,
|
| 917 |
+
"signature": import_signature,
|
| 918 |
+
"reference_source": reference.source,
|
| 919 |
+
"reference_model": reference.model,
|
| 920 |
+
"reference_objective": reference.objective_value,
|
| 921 |
+
"reference_chosen_model": reference.chosen_model,
|
| 922 |
+
"import_signature": import_signature,
|
| 923 |
+
}
|
| 924 |
+
debug_case_appender.add_case(
|
| 925 |
+
problem_id=problem_id,
|
| 926 |
+
problem_desc=prompt_desc,
|
| 927 |
+
solution_code=solution_code,
|
| 928 |
+
objective_value=0.0,
|
| 929 |
+
metadata=metadata,
|
| 930 |
+
)
|
| 931 |
+
debug_case_seen.add(import_signature)
|
| 932 |
+
stats["debug_memory"]["raw_records_added"] += 1
|
| 933 |
+
stats["debug_memory"]["case_records_added"] += 1
|
| 934 |
+
|
| 935 |
+
append_jsonl(debug_memory_file, pending_debug_rows)
|
| 936 |
+
|
| 937 |
+
main_added = main_appender.finalize()
|
| 938 |
+
debug_case_added = debug_case_appender.finalize()
|
| 939 |
+
debate_added = debate_appender.finalize()
|
| 940 |
+
|
| 941 |
+
summary = {
|
| 942 |
+
"created_at": now_iso(),
|
| 943 |
+
"variant_dir": str(variant_dir),
|
| 944 |
+
"base_root": str(base_root),
|
| 945 |
+
"source_patterns": list(args.source),
|
| 946 |
+
"resolved_source_roots": [str(path) for path in source_roots],
|
| 947 |
+
"embedding_model": args.embedding_model,
|
| 948 |
+
"tolerance": args.tolerance,
|
| 949 |
+
"use_relative_tolerance": args.use_relative_tolerance,
|
| 950 |
+
"mismatch_tolerance": args.mismatch_tolerance,
|
| 951 |
+
"stats": stats,
|
| 952 |
+
"final_counts": {
|
| 953 |
+
"memory_storage_cases": count_jsonl_lines(main_memory_dir / "cases.jsonl"),
|
| 954 |
+
"debug_memory_records": count_jsonl_lines(debug_memory_file),
|
| 955 |
+
"debug_case_memory_cases": count_jsonl_lines(debug_case_memory_dir / "cases.jsonl"),
|
| 956 |
+
"debate_memory_cases": count_jsonl_lines(debate_memory_dir / "cases.jsonl"),
|
| 957 |
+
"main_added_persisted": main_added,
|
| 958 |
+
"debug_case_added_persisted": debug_case_added,
|
| 959 |
+
"debate_added_persisted": debate_added,
|
| 960 |
+
},
|
| 961 |
+
}
|
| 962 |
+
dump_json(variant_dir / "import_summary.json", summary)
|
| 963 |
+
|
| 964 |
+
print("=== Import Complete ===")
|
| 965 |
+
print(f"Variant: {variant_dir}")
|
| 966 |
+
print(f"Main memory added: {main_added}")
|
| 967 |
+
print(f"Debug raw added: {len(pending_debug_rows)}")
|
| 968 |
+
print(f"Debug case added: {debug_case_added}")
|
| 969 |
+
print(f"Debate memory added: {debate_added}")
|
| 970 |
+
print(f"Summary: {variant_dir / 'import_summary.json'}")
|
| 971 |
+
|
| 972 |
+
|
| 973 |
+
if __name__ == "__main__":
|
| 974 |
+
main()
|
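The import above is made idempotent by the `import_signature` values: each stored case hashes its dataset, problem id, code, and predicted objective, and a signature seen once is skipped on every later pass. A minimal sketch of that scheme, assuming `sha1_short` (defined earlier in this file, not shown here) is simply a truncated SHA-1 hex digest:

import hashlib

def sha1_short(text: str, length: int = 20) -> str:
    # Assumed behavior of the helper used above: truncated SHA-1 hex digest.
    return hashlib.sha1(text.encode("utf-8")).hexdigest()[:length]

main_seen: set = set()

def consensus_signature(dataset: str, problem_id: int, code: str, pred) -> str:
    # Mirrors the signature_basis construction in the import loop above.
    basis = f"main|consensus|{dataset}|{problem_id}|{sha1_short(code, 20)}|{pred}"
    return f"standalone-main:{sha1_short(basis, 20)}"

sig = consensus_signature("EasyLP", 42, "x = 1", 17.0)
print(sig in main_seen)   # False: the first import adds the case
main_seen.add(sig)
print(sig in main_seen)   # True: a re-run counts it as duplicates_skipped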
src/debate_memory/build_memory_from_eval_results.py
ADDED
@@ -0,0 +1,293 @@
#!/usr/bin/env python3
"""
Build solution memory from evaluation result directories.

Any evaluation directory can be used as input as long as it contains both
`evaluation_results.jsonl` and a `code/` directory. The script extracts problem
descriptions, executable code, and objective values from correct cases and
writes them into the solution-memory store.
"""

import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Optional

from .config import find_benchmark_path, get_benchmark_dirs, normalize_dataset_name
from .memory_bank import MemoryBank

SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent.parent
DEFAULT_BENCHMARKS_DIR = get_benchmark_dirs(PROJECT_ROOT)[0]


def load_evaluation_results(eval_file: str) -> Dict[int, Dict]:
    """Load evaluation results as `{id: {...}}`."""
    results = {}
    if not os.path.exists(eval_file):
        print(f"Warning: evaluation result file not found: {eval_file}")
        return results

    with open(eval_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data = json.loads(line)
                results[data['id']] = data
    return results


def load_benchmark_data(benchmark_file: str) -> Dict[int, Dict]:
    """Load benchmark data as `{id: {...}}`."""
    data = {}
    if not os.path.exists(benchmark_file):
        print(f"Warning: benchmark file not found: {benchmark_file}")
        return data

    with open(benchmark_file, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if line.strip():
                item = json.loads(line)
                # Prefer an explicit id field, otherwise fall back to the line index.
                problem_id = item.get('id', item.get('problem_id', idx))
                data[problem_id] = item
    return data


def load_solution_code(code_file: str) -> Optional[str]:
    """Load a solution code file."""
    if not os.path.exists(code_file):
        return None

    try:
        with open(code_file, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Warning: failed to read code file {code_file}: {e}")
        return None


def extract_dataset_name(eval_dir: str) -> Optional[str]:
    """
    Extract the dataset name from an evaluation directory name.

    Example:
        `deepseek-chat_EasyLP_clean_eval_20251024_120712.jsonl` -> `EasyLP`
    """
    dir_name = os.path.basename(eval_dir)
    # Remove the .jsonl suffix if present.
    if dir_name.endswith('.jsonl'):
        dir_name = dir_name[:-6]

    # Remove the model name and timestamp.
    parts = dir_name.split('_')
    # Locate the `eval` marker.
    try:
        eval_idx = parts.index('eval')
        # The dataset name should appear before `eval`, after the model name.
        dataset_parts = parts[:eval_idx]
        if len(dataset_parts) > 1:
            return normalize_dataset_name('_'.join(dataset_parts[1:]))
        else:
            return normalize_dataset_name(dataset_parts[0]) if dataset_parts else None
    except ValueError:
        # Fallback for names of the form model_dataset_timestamp.
        if len(parts) >= 3:
            return normalize_dataset_name('_'.join(parts[1:-1]))
        return None


def build_memory_from_eval_result(eval_result_dir: str, benchmarks_dir: str, memory_bank: MemoryBank):
    """
    Build memory from a single evaluation result directory.

    Args:
        eval_result_dir: Directory containing `evaluation_results.jsonl` and `code/`.
        benchmarks_dir: Benchmark dataset directory.
        memory_bank: MemoryBank instance.
    """
    eval_file = os.path.join(eval_result_dir, 'evaluation_results.jsonl')
    code_dir = os.path.join(eval_result_dir, 'code')

    if not os.path.exists(eval_file):
        print(f"Warning: skipping {eval_result_dir}: evaluation_results.jsonl not found")
        return 0, 0

    # Extract the dataset name.
    dataset_name = extract_dataset_name(eval_result_dir)
    if not dataset_name:
        print(f"Warning: skipping {eval_result_dir}: failed to extract dataset name")
        return 0, 0

    benchmark_file = os.path.join(benchmarks_dir, f"{dataset_name}.jsonl")
    if not os.path.exists(benchmark_file):
        try:
            benchmark_file = str(find_benchmark_path(PROJECT_ROOT, dataset_name))
        except FileNotFoundError:
            pass
    if not os.path.exists(benchmark_file):
        print(f"Warning: skipping {eval_result_dir}: benchmark file not found: {benchmark_file}")
        return 0, 0

    print(f"Processing dataset: {dataset_name}")
    print(f"  evaluation results: {eval_file}")
    print(f"  benchmark file: {benchmark_file}")
    print(f"  code directory: {code_dir}")

    # Load all required inputs.
    eval_results = load_evaluation_results(eval_file)
    benchmark_data = load_benchmark_data(benchmark_file)

    added_count = 0
    skipped_count = 0

    # Process each correct case.
    for problem_id, eval_result in eval_results.items():
        # Only keep correct cases.
        if not eval_result.get('is_correct', False):
            skipped_count += 1
            continue

        # Recover the problem description.
        if problem_id not in benchmark_data:
            print(f"  Warning: skipping ID {problem_id}: missing from benchmark file")
            skipped_count += 1
            continue

        benchmark_item = benchmark_data[problem_id]
        # Support both `description` and `en_question`.
        description = benchmark_item.get('description', '') or benchmark_item.get('en_question', '')

        if not description:
            print(f"  Warning: skipping ID {problem_id}: missing problem description")
            skipped_count += 1
            continue

        # Load the solution code.
        code_file = os.path.join(code_dir, f"problem_{problem_id}.py")
        solution_code = load_solution_code(code_file)

        if not solution_code:
            print(f"  Warning: skipping ID {problem_id}: code file missing or unreadable")
            skipped_count += 1
            continue

        # Recover the objective value.
        objective_value = eval_result.get('predicted_objective')
        if objective_value is None:
            # Fall back to the benchmark answer fields if needed.
            answer_str = benchmark_item.get('answer', '') or benchmark_item.get('en_answer', '')
            try:
                objective_value = float(answer_str)
            except (TypeError, ValueError):
                print(f"  Warning: skipping ID {problem_id}: objective value unavailable")
                skipped_count += 1
                continue

        # Build metadata for the stored case.
        ground_truth = benchmark_item.get('answer', '') or benchmark_item.get('en_answer', '')
        metadata = {
            'source': 'eval_results',
            'dataset': dataset_name,
            'eval_dir': os.path.basename(eval_result_dir),
            'execution_status': eval_result.get('execution_status', 'unknown'),
            'ground_truth': ground_truth,
        }

        # Do not deduplicate across datasets; the same problem_id may appear in multiple benchmarks.

        # Add the case to the memory bank.
        try:
            memory_bank.add_case(
                problem_id=problem_id,
                problem_desc=description,
                solution_code=solution_code,
                objective_value=float(objective_value),
                is_correct=True,
                metadata=metadata
            )
            added_count += 1
        except Exception as e:
            print(f"  Error: failed to add ID {problem_id}: {e}")
            skipped_count += 1

    print(f"  added cases: {added_count}")
    print(f"  skipped cases: {skipped_count}")
    print()

    return added_count, skipped_count


def main():
    import argparse

    parser = argparse.ArgumentParser(description="Build a memory bank from evaluation results")
    parser.add_argument('--eval_dirs', type=str, nargs='+', required=True,
                        help='Evaluation result directories containing evaluation_results.jsonl and code/')
    parser.add_argument('--benchmarks_dir', type=str,
                        default=str(DEFAULT_BENCHMARKS_DIR),
                        help='Benchmark dataset directory')
    parser.add_argument('--memory_dir', type=str,
                        default=str(PROJECT_ROOT / "memory_storage"),
                        help='Memory storage directory')
    parser.add_argument('--clear', action='store_true',
                        help='Clear the existing memory store before building')

    args = parser.parse_args()

    # Validate input directories.
    if not os.path.exists(args.benchmarks_dir):
        print(f"Error: benchmark directory does not exist: {args.benchmarks_dir}")
        sys.exit(1)

    # Clear the memory store if requested.
    if args.clear:
        if os.path.exists(args.memory_dir):
            import shutil
            print(f"Clearing existing memory store: {args.memory_dir}")
            shutil.rmtree(args.memory_dir)
        print()

    # Initialize the memory bank.
    print("="*70)
    print("Building Memory Bank from Evaluation Results")
    print("="*70)
    print()

    memory_bank = MemoryBank(memory_dir=args.memory_dir)
    print(f"Current memory size: {memory_bank.case_count} cases")
    print()

    # Process each evaluation directory.
    total_added = 0
    total_skipped = 0

    for eval_dir in args.eval_dirs:
        if not os.path.exists(eval_dir):
            print(f"Warning: skipping missing directory: {eval_dir}")
            continue

        added, skipped = build_memory_from_eval_result(
            eval_dir, args.benchmarks_dir, memory_bank
        )
        total_added += added
        total_skipped += skipped

    # Refresh the case count.
    memory_bank.case_count = memory_bank._count_cases()

    print("="*70)
    print("Memory Bank Build Complete")
    print("="*70)
    print(f"Total added: {total_added} cases")
    print(f"Total skipped: {total_skipped} cases")
    print(f"Final memory size: {memory_bank.case_count} cases")
    print()
    print(f"Memory location: {args.memory_dir}")
    print(f"  - cases.jsonl: {os.path.join(args.memory_dir, 'cases.jsonl')}")
    print(f"  - index/: {os.path.join(args.memory_dir, 'index')}")
    print("="*70)


if __name__ == "__main__":
    main()
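For reference, `extract_dataset_name` above splits the directory name on underscores and anchors on the literal `eval` token; a quick trace of the documented example, using only the stdlib (the trailing `normalize_dataset_name` call then maps the `_clean` suffix to the canonical name):

# Trace of extract_dataset_name's happy path on the documented example.
name = "deepseek-chat_EasyLP_clean_eval_20251024_120712"
parts = name.split("_")
dataset_parts = parts[:parts.index("eval")]  # ['deepseek-chat', 'EasyLP', 'clean']
print("_".join(dataset_parts[1:]))  # EasyLP_clean, normalized to EasyLP downstream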
src/debate_memory/config.py
ADDED
@@ -0,0 +1,189 @@
"""
Configuration file for simple RAG evaluation
Contains prompt templates and other settings
"""

from pathlib import Path

# ============================================
# Prompt Templates
# ============================================

# Default Gurobi prompt template
GUROBI_PROMPT = {
    "system": """You are a helpful Assistant with expertise in mathematical modeling and the Gurobi solver. When the User provides an OR question, you will analyze it, build a detailed mathematical model, and provide the Gurobi code to solve it.

Your response should follow these steps:
1. Carefully analyze the problem to identify decision variables, objective, and constraints.

2. Develop a complete mathematical model, explicitly defining:
   - Sets
   - Parameters
   - Decision Variables (and their types)
   - Objective Function
   - Constraints
3. Provide the corresponding Gurobi Python code to implement the model.

Implementation guardrails:
- Use `gurobipy` exclusively (avoid cvxpy/pulp/copty imports).
- When indexing tupledict variables across periods, introduce an explicit sentinel index (e.g., period 0) for initial conditions instead of accessing undefined keys like `x[-1]`.
- Define any Big-M constants explicitly using bounds derived from the data before they appear in constraints.
- Keep the model linear/integer; if a relationship seems non-linear, introduce auxiliary variables and linearization rather than exponentiation or log constraints.
- Always ensure every symbol referenced in constraints/objective (such as `M`, helper dictionaries, etc.) is declared in the code block.
""",
    "user": """Problem:
{question}

Provide a complete solution with mathematical model and Gurobi code.
"""
}

# ============================================
# Model Configuration
# ============================================

# Supported models and their default temperatures
MODEL_CONFIGS = {
    "gpt-4o": {"temperature": 0.01, "max_tokens": 8192},
    "gpt-4o-mini": {"temperature": 0.01, "max_tokens": 8192},
    "deepseek-chat": {"temperature": 0.01, "max_tokens": 8192},
    "gemini-2.0-flash-exp": {"temperature": 0.01, "max_tokens": 8192},
    "gemini-2.5-pro": {"temperature": 0.01, "max_tokens": 8192},
}

# ============================================
# Evaluation Configuration
# ============================================

EVAL_CONFIG = {
    # Execution settings
    "timeout": 60,  # seconds
    "max_retries": 3,

    # Answer comparison settings
    "tolerance": 0.05,  # 5% relative tolerance by default
    "use_relative_tolerance": True,
    "absolute_tolerance": 1e-3,  # for zero objective values

    # Output settings
    "save_code": True,
    "save_output": False,  # whether to save stdout/stderr
    "verbose": False,
}

# ============================================
# Dataset Configuration
# ============================================

# Supported datasets
DATASETS = [
    "ComplexLP",
    "EasyLP",
    "IndustryOR",
    "NL4OPT",
    "NLP4LP",
    "ReSocratic",
    "ComplexOR",
    "OPT-Principled",
]

DATASET_ALIASES = {
    "complexlp_clean": "ComplexLP",
    "easylp_clean": "EasyLP",
    "industryor_clean": "IndustryOR",
    "industryor_v2": "IndustryOR",
    "industryor_fixedv2": "IndustryOR",
    "industryor_fixedv2_clean": "IndustryOR",
    "nl4opt": "NL4OPT",
    "nl4opt_clean": "NL4OPT",
    "nlp4lp_clean": "NLP4LP",
    "complexor_clean": "ComplexOR",
    "resocratic_clean": "ReSocratic",
    "combined": "OPT-Principled",
    "combined_dataset": "OPT-Principled",
    "opt-principled_clean": "OPT-Principled",
}

# Dataset-specific settings (if needed)
DATASET_CONFIG = {
    "ComplexLP": {"tolerance": 0.05},
    "EasyLP": {"tolerance": 0.01},
    "IndustryOR": {"tolerance": 0.05},
    "OPT-Principled": {"tolerance": 0.05},
}

# ============================================
# Utility Functions
# ============================================

def get_prompt_template(template_name="default"):
    """Get prompt template by name"""
    templates = {
        "default": GUROBI_PROMPT,
    }
    return templates.get(template_name, GUROBI_PROMPT)


def get_model_config(model_name):
    """Get configuration for a specific model"""
    return MODEL_CONFIGS.get(model_name, {"temperature": 0.01, "max_tokens": 8192})


def get_dataset_config(dataset_name):
    """Get configuration for a specific dataset"""
    return DATASET_CONFIG.get(normalize_dataset_name(dataset_name), {"tolerance": 0.05})


def normalize_dataset_name(dataset_name: str) -> str:
    """Map historical dataset names to the canonical OPEN benchmark names."""
    if not dataset_name:
        return dataset_name

    name = dataset_name.strip()
    if name.endswith(".jsonl"):
        name = name[:-6]

    alias = DATASET_ALIASES.get(name.casefold())
    if alias:
        return alias

    for canonical_name in DATASETS:
        if canonical_name.casefold() == name.casefold():
            return canonical_name

    if name.endswith("_clean"):
        base_name = name[:-6]
        for canonical_name in DATASETS:
            if canonical_name.casefold() == base_name.casefold():
                return canonical_name

    return name


def get_benchmark_dirs(project_root: Path) -> list[Path]:
    """Return benchmark directories in priority order for the migrated OPEN layout."""
    return [
        project_root.parent.parent / "data" / "benchmarks",
        project_root / "clean_benchmarks",
        project_root.parent / "clean_benchmarks",
    ]


def find_benchmark_path(project_root: Path, dataset_name: str) -> Path:
    """Locate the benchmark file for a dataset, accepting legacy names as input."""
    normalized_name = normalize_dataset_name(dataset_name)
    candidate_names = [normalized_name]
    raw_name = dataset_name[:-6] if dataset_name.endswith(".jsonl") else dataset_name
    if raw_name not in candidate_names:
        candidate_names.append(raw_name)

    for directory in get_benchmark_dirs(project_root):
        for name in candidate_names:
            candidate = directory / f"{name}.jsonl"
            if candidate.exists():
                return candidate

    raise FileNotFoundError(
        f"Dataset '{dataset_name}' not found. Checked directories: "
        f"{[str(path) for path in get_benchmark_dirs(project_root)]}"
    )
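Since several historical runs used lowercase or `_clean`-suffixed names, the alias table above is what keeps them pointing at a single canonical benchmark. A short usage sketch of the normalization helpers (assuming the package is importable as `debate_memory`, per the module layout in this upload):

from debate_memory.config import get_dataset_config, normalize_dataset_name

print(normalize_dataset_name("easylp_clean"))      # EasyLP, via DATASET_ALIASES
print(normalize_dataset_name("IndustryOR.jsonl"))  # IndustryOR, suffix stripped
print(get_dataset_config("easylp_clean"))          # {'tolerance': 0.01}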
src/debate_memory/debate_memory_builder.py
ADDED
@@ -0,0 +1,477 @@
"""
Build a debate-specific memory bank from historical debate runs.

This scans existing debate result directories such as
`./results/Agora-Opt/debate/<dataset>/<timestamp>_<modelA>_vs_<modelB>/`,
identifies problems where the two single generations disagreed yet
the debate converged to a correct consensus, summarizes the key reconciliation
insights (optionally via an LLM), and stores the cases inside a dedicated
`MemoryBank` directory (default: ./debate_memory_storage).
"""

from __future__ import annotations

import argparse
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

from tqdm import tqdm

from .llm import get_response
from .memory_bank import MemoryBank

PKG_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = PKG_DIR.parent.parent
DEFAULT_RUNS_ROOT = PROJECT_ROOT.parent.parent / "results" / "Agora-Opt" / "debate"
DEFAULT_DEBATE_MEMORY_DIR = PROJECT_ROOT / "debate_memory_storage"


@dataclass
class DebateCaseInput:
    dataset: str
    problem_id: int
    description: str
    final_code: str
    final_result: Optional[float]
    debate_rounds: List[Dict]
    modelA: str
    modelB: str
    run_dir: Path
    ground_truth: Optional[str]
    initial_A_result: Optional[float]
    initial_B_result: Optional[float]
    evaluation: Dict
    metadata: Dict


def load_jsonl(path: Path) -> List[Dict]:
    if not path.exists():
        return []
    data: List[Dict] = []
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return data


def float_or_none(value) -> Optional[float]:
    if value is None:
        return None
    try:
        return float(value)
    except (ValueError, TypeError):
        return None


def has_disagreement(entry: Dict, tolerance: float) -> bool:
    a = float_or_none(entry.get("initial_A_result"))
    b = float_or_none(entry.get("initial_B_result"))
    if a is None or b is None:
        return True
    return abs(a - b) > tolerance


def summarize_rounds(rounds: List[Dict], max_chars: int = 2000) -> str:
    if not rounds:
        return ""
    lines: List[str] = []
    for rnd in rounds:
        round_idx = rnd.get("round")
        res_a = rnd.get("result_A")
        res_b = rnd.get("result_B")
        status_a = rnd.get("status_A")
        status_b = rnd.get("status_B")
        analysis_a = (rnd.get("analysis_A") or "").strip()
        analysis_b = (rnd.get("analysis_B") or "").strip()
        lines.append(
            f"Round {round_idx}: A={res_a} ({status_a}), B={res_b} ({status_b})"
        )
        if analysis_a:
            lines.append(f"Model A analysis:\n{analysis_a}")
        if analysis_b:
            lines.append(f"Model B analysis:\n{analysis_b}")
        lines.append("")
    text = "\n".join(lines).strip()
    if len(text) <= max_chars:
        return text
    return text[: max_chars - 200] + "\n...\n(truncated)"


def build_summary_payload(
    case: DebateCaseInput,
    llm_model: Optional[str],
    temperature: float,
    llm_attempts: int = 1,
) -> Dict:
    history_text = summarize_rounds(case.debate_rounds)
    default_summary = {
        "summary": (
            f"Initial mismatch: modelA={case.initial_A_result}, "
            f"modelB={case.initial_B_result}. "
            f"Debate converged in {len(case.debate_rounds)} rounds."
        ),
        "mismatch_reason": "",
        "decisive_argument": "",
        "guardrails": [],
        "modeling_patterns": [],
    }
    if not llm_model:
        return default_summary | {"history_excerpt": history_text}

    prompt = f"""
You are helping an optimisation-debate memory builder.

Problem description:
{case.description}

Initial disagreement:
- Model A result: {case.initial_A_result}
- Model B result: {case.initial_B_result}
- Ground truth (if known): {case.ground_truth}

Debate transcript:
{history_text}

Final consensus objective: {case.final_result}

Please return a JSON object with the following keys:
- "summary": 2-3 sentences explaining how the debate resolved the mismatch.
- "mismatch_reason": concise reason for the disagreement.
- "decisive_argument": specific insight that convinced both sides.
- "guardrails": list of actionable bullet points the next debater should follow.
- "modeling_patterns": list of reusable modeling tricks/structures that appeared.

JSON ONLY. No prose outside the JSON.
""".strip()

    attempts_remaining = max(1, llm_attempts)
    last_error: Optional[Exception] = None
    while attempts_remaining > 0:
        try:
            response = get_response(
                prompt,
                model=llm_model,
                temperature=temperature,
                maximum_retries=1,
            )
            payload = json.loads(response)
            payload["history_excerpt"] = history_text
            return payload
        except Exception as exc:  # noqa: BLE001
            last_error = exc
            attempts_remaining -= 1

    fallback = default_summary.copy()
    failure_reason = f"{last_error}" if last_error else "LLM call failed"
    fallback["summary"] += f" LLM summary failed: {failure_reason}"
    fallback["history_excerpt"] = history_text
    return fallback


def existing_signatures(memory_dir: Path) -> set[str]:
    cases_path = memory_dir / "cases.jsonl"
    if not cases_path.exists():
        return set()
    signs: set[str] = set()
    with cases_path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            meta = data.get("metadata") or {}
            sig = meta.get("debate_signature")
            if sig:
                signs.add(sig)
    return signs


class DebateMemoryBuilder:
    def __init__(
        self,
        runs_root: Path,
        memory_dir: Path,
        mismatch_tolerance: float,
        llm_model: Optional[str],
        temperature: float,
        llm_attempts: int,
        max_workers: int,
        datasets: Optional[Iterable[str]] = None,
        dry_run: bool = False,
    ) -> None:
        self.runs_root = runs_root
        self.memory_dir = memory_dir
        self.mismatch_tolerance = mismatch_tolerance
        self.llm_model = llm_model
        self.temperature = temperature
        self.llm_attempts = max(1, llm_attempts)
        self.max_workers = max_workers
        self.datasets_filter = {d.lower() for d in datasets} if datasets else None
        self.dry_run = dry_run

    def build(self) -> None:
        candidates = self._collect_candidates()
        if not candidates:
            print("No qualifying debate cases found.")
            return

        if not self.memory_dir.exists() and not self.dry_run:
            self.memory_dir.mkdir(parents=True, exist_ok=True)

        seen_sigs = existing_signatures(self.memory_dir)

        bank = None if self.dry_run else MemoryBank(memory_dir=str(self.memory_dir))

        added = 0
        skipped_duplicates = 0
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = {
                executor.submit(self._summarize_case, case): case
                for case in candidates
            }
            for future in tqdm(as_completed(futures), total=len(futures), desc="Summarizing"):
                case = futures[future]
                signature = f"{case.dataset}:{case.problem_id}:{case.run_dir.name}"
                if signature in seen_sigs:
                    skipped_duplicates += 1
                    continue
                summary_payload = future.result()
                description = (
                    f"{case.description.strip()}\n\n"
                    f"# Debate Memory Summary\n"
                    f"{summary_payload.get('summary', '').strip()}"
                ).strip()
                metadata = {
                    "source": "debate_memory_builder",
                    "dataset": case.dataset,
                    "run_dir": str(case.run_dir),
                    "modelA": case.modelA,
                    "modelB": case.modelB,
                    "initial_A_result": case.initial_A_result,
                    "initial_B_result": case.initial_B_result,
                    "ground_truth": case.ground_truth,
                    "debate_signature": signature,
                    "summary": summary_payload,
                }
                if self.dry_run:
                    added += 1
                    continue
                try:
                    bank.add_case(
                        problem_id=case.problem_id,
                        problem_desc=description,
                        solution_code=case.final_code,
                        objective_value=case.final_result or 0.0,
                        is_correct=True,
                        metadata=metadata,
                    )
                    added += 1
                    seen_sigs.add(signature)
                except Exception as exc:  # noqa: BLE001
                    print(f"Failed to add case {signature}: {exc}")

        print("===== Debate Memory Builder Summary =====")
        print(f"Runs root: {self.runs_root}")
        print(f"Output dir: {self.memory_dir}")
        print(f"Total candidates: {len(candidates)}")
        print(f"Added cases: {added}")
        print(f"Duplicates skipped: {skipped_duplicates}")
        if self.dry_run:
            print("Dry-run mode: no cases were written.")

    def _collect_candidates(self) -> List[DebateCaseInput]:
        candidates: List[DebateCaseInput] = []
        if not self.runs_root.exists():
            print(f"Runs root not found: {self.runs_root}")
            return candidates

        for dataset_dir in sorted(self.runs_root.iterdir()):
            if not dataset_dir.is_dir():
                continue
            dataset_name = dataset_dir.name
            if self.datasets_filter and dataset_name.lower() not in self.datasets_filter:
                continue
            for run_dir in sorted(dataset_dir.iterdir()):
                if not run_dir.is_dir():
                    continue
                dataset_candidates = self._parse_run(dataset_name, run_dir)
                candidates.extend(dataset_candidates)
        return candidates

    def _parse_run(self, dataset: str, run_dir: Path) -> List[DebateCaseInput]:
        results_path = run_dir / "debate_results.jsonl"
        if not results_path.exists():
            return []

        modelA, modelB = self._infer_models(run_dir.name)
        consensus_path = next(run_dir.glob("consensus_*_vs_*.jsonl"), None)
        consensus_records = load_jsonl(consensus_path) if consensus_path else []
        desc_map = {int(rec["id"]): rec for rec in consensus_records if "id" in rec}

        eval_path = run_dir / "eval_consensus" / "evaluation_results.jsonl"
        evaluation_map = {
            int(rec["id"]): rec for rec in load_jsonl(eval_path) if "id" in rec
        }

        run_candidates: List[DebateCaseInput] = []
        for entry in load_jsonl(results_path):
            problem_id = entry.get("problem_id")
            if problem_id is None:
                continue
            problem_id = int(problem_id)
            if not has_disagreement(entry, self.mismatch_tolerance):
                continue
            if not entry.get("converged"):
                continue
            evaluation = evaluation_map.get(problem_id)
            desc_entry = desc_map.get(problem_id)
            if desc_entry:
                description = desc_entry.get("description") or f"{dataset} problem {problem_id}"
            else:
                description = f"Dataset {dataset} problem {problem_id}"
            final_code = entry.get("final_code") or (
                desc_entry.get("generated_code", "") if desc_entry else ""
            )
            if not final_code:
                continue
            debate_rounds = entry.get("debate_rounds") or []
            if not debate_rounds:
                continue
            run_candidates.append(
                DebateCaseInput(
                    dataset=dataset,
                    problem_id=problem_id,
                    description=description,
                    final_code=final_code,
                    final_result=float_or_none(entry.get("final_result")),
                    debate_rounds=debate_rounds,
                    modelA=modelA,
                    modelB=modelB,
                    run_dir=run_dir,
                    ground_truth=entry.get("ground_truth"),
                    initial_A_result=float_or_none(entry.get("initial_A_result")),
                    initial_B_result=float_or_none(entry.get("initial_B_result")),
                    evaluation=evaluation or {},
                    metadata={
                        "run_dir": str(run_dir),
                        "dataset": dataset,
                    },
                )
            )
        return run_candidates

    @staticmethod
    def _infer_models(run_name: str) -> Tuple[str, str]:
        """
        Run folder format: <timestamp>_<modelA>_vs_<modelB>
        """
        parts = run_name.split("_vs_")
        if len(parts) != 2:
            return "modelA", "modelB"
        left = parts[0].split("_")  # timestamp + modelA pieces
        if len(left) < 2:
            return left[-1], parts[1]
        modelA = "_".join(left[1:])
        modelB = parts[1]
        return modelA, modelB

    def _summarize_case(self, case: DebateCaseInput) -> Dict:
        return build_summary_payload(
            case,
            llm_model=self.llm_model,
            temperature=self.temperature,
            llm_attempts=self.llm_attempts,
        )


def parse_args():
    parser = argparse.ArgumentParser(description="Build debate memory bank from historical runs.")
    parser.add_argument(
        "--runs_root",
        type=str,
        default=str(DEFAULT_RUNS_ROOT),
        help="Directory containing debate run artifacts.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=str(DEFAULT_DEBATE_MEMORY_DIR),
        help="Directory to store the debate memory bank.",
    )
    parser.add_argument(
        "--datasets",
        type=str,
        nargs="*",
        default=None,
        help="Optional dataset filters (case-insensitive).",
    )
    parser.add_argument(
        "--mismatch_tolerance",
        type=float,
        default=1e-3,
        help="Minimum absolute difference between initial results to consider a disagreement.",
    )
    parser.add_argument(
        "--llm_model",
        type=str,
        default=None,
        help="Optional model name for LLM-based summaries. If omitted, heuristic summaries are used.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.3,
        help="Temperature for LLM summaries.",
    )
    parser.add_argument(
        "--max_workers",
        type=int,
        default=4,
        help="Parallel workers for summary generation.",
    )
    parser.add_argument(
        "--llm_attempts",
        type=int,
        default=2,
        help="Number of LLM attempts per case before falling back to heuristics.",
    )
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Run the pipeline without writing to the memory bank.",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    builder = DebateMemoryBuilder(
        runs_root=Path(args.runs_root),
        memory_dir=Path(args.output_dir),
        mismatch_tolerance=args.mismatch_tolerance,
        llm_model=args.llm_model,
        temperature=args.temperature,
        llm_attempts=args.llm_attempts,
        max_workers=args.max_workers,
        datasets=args.datasets,
        dry_run=args.dry_run,
    )
    builder.build()


if __name__ == "__main__":
    main()
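The `_infer_models` parsing above leans entirely on the `<timestamp>_<modelA>_vs_<modelB>` folder convention: everything before the first underscore on the left half is treated as the timestamp, and the remainder is model A. An illustration on a hypothetical run-folder name (the timestamp format here is a made-up example):

# Hypothetical folder name following <timestamp>_<modelA>_vs_<modelB>.
run_name = "20251024T120712_gpt-4o_vs_deepseek-chat"
left, model_b = run_name.split("_vs_")
model_a = "_".join(left.split("_")[1:])  # drop the leading timestamp token
print(model_a, model_b)  # gpt-4o deepseek-chat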
src/debate_memory/debug_executor.py
ADDED
@@ -0,0 +1,136 @@
```python
# -*- coding: utf-8 -*-
"""Execute generated Python code and capture basic diagnostics."""

from __future__ import annotations

import os
import re
import subprocess
import sys
from dataclasses import dataclass
from typing import Optional


AUTO_OBJECTIVE_SNIPPET = """
# Auto-added snippet: attempt to print the objective value for downstream evaluation.
try:
    candidate = None
    for name in ("model", "m", "Model"):
        if name in globals():
            candidate = globals()[name]
            break
    if candidate is not None and hasattr(candidate, "objVal"):
        print(f"OBJECTIVE_VALUE: {candidate.objVal}")
except Exception:
    pass
""".strip()


@dataclass
class ExecutionResult:
    status: str
    stdout: str
    stderr: str
    objective_value: Optional[float]
    returncode: Optional[int]
    code_path: Optional[str]


def _ensure_directory(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def _append_objective_snippet(code: str) -> str:
    if "OBJECTIVE_VALUE" in code:
        return code if code.endswith("\n") else code + "\n"
    return f"{code.rstrip()}\n\n{AUTO_OBJECTIVE_SNIPPET}\n"


def _normalize_output(value: object) -> str:
    if value is None:
        return ""
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    return str(value)


def _extract_objective_value(output: str) -> Optional[float]:
    if not output:
        return None
    patterns = [
        r"OBJECTIVE_VALUE:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
        r"Optimal\s+[Oo]bjective[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
        r"Obj:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
        r"Objective\s+value:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)",
    ]
    for pattern in patterns:
        match = re.search(pattern, output, re.IGNORECASE)
        if not match:
            continue
        try:
            return float(match.group(1))
        except ValueError:
            continue
    return None


def execute_generated_code(
    code: str,
    problem_id: int,
    output_dir: str,
    timeout: int = 120,
) -> ExecutionResult:
    """Write code to disk, execute it, and capture the outcome."""
    code_dir = os.path.join(output_dir, "code")
    _ensure_directory(code_dir)

    code_with_snippet = _append_objective_snippet(code)
    code_file = os.path.join(code_dir, f"problem_{problem_id}.py")
    with open(code_file, "w", encoding="utf-8") as fh:
        fh.write(code_with_snippet)

    try:
        completed = subprocess.run(
            [sys.executable, os.path.basename(code_file)],
            cwd=code_dir,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        return ExecutionResult(
            status="timeout",
            stdout=_normalize_output(exc.stdout),
            stderr=f"Execution timeout after {timeout} seconds",
            objective_value=None,
            returncode=None,
            code_path=code_file,
        )
    except Exception as exc:  # pragma: no cover - defensive
        return ExecutionResult(
            status="error",
            stdout="",
            stderr=str(exc),
            objective_value=None,
            returncode=None,
            code_path=code_file,
        )

    stdout = _normalize_output(completed.stdout)
    stderr = _normalize_output(completed.stderr)
    returncode = completed.returncode

    status = "success" if returncode == 0 else "execution_error"
    objective_value = _extract_objective_value(stdout) if status == "success" else None

    return ExecutionResult(
        status=status,
        stdout=stdout,
        stderr=stderr,
        objective_value=objective_value,
        returncode=returncode,
        code_path=code_file,
    )


__all__ = ["ExecutionResult", "execute_generated_code"]
```
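A minimal usage sketch for the executor above, assuming the repository root is on `PYTHONPATH` so `src.debate_memory` imports as a package; the `runs/demo` path is illustrative:

```python
from src.debate_memory.debug_executor import execute_generated_code

# Hypothetical generated snippet; any script that prints
# "OBJECTIVE_VALUE: <number>" (or gets the snippet auto-appended) works.
code = "model = None\nprint('OBJECTIVE_VALUE: 42.0')\n"

result = execute_generated_code(code, problem_id=1, output_dir="runs/demo", timeout=30)
print(result.status)           # "success" when the script exits with code 0
print(result.objective_value)  # 42.0, parsed from stdout
```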
src/debate_memory/debug_memory.py
ADDED
@@ -0,0 +1,163 @@
```python
# -*- coding: utf-8 -*-
"""Lightweight persistence for debugging experiences."""

from __future__ import annotations

import hashlib
import json
import threading
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional


def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()


def _normalise_error(text: str) -> str:
    return (text or "").strip()


@dataclass
class DebugRecord:
    """Single debugging observation stored on disk."""

    signature: str
    status: str
    error_text: str
    guidance: str
    problem_id: Optional[int]
    description: str
    metadata: Dict[str, Any]
    timestamp: str

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


_PKG_DIR = Path(__file__).resolve().parent
_PROJECT_ROOT = _PKG_DIR.parent.parent


class DebugMemoryStore:
    """Append-only store keyed by error signature."""

    DEFAULT_PATH = _PROJECT_ROOT / "memory_storage" / "debug_memory.jsonl"

    def __init__(self, path: Optional[str] = None):
        self.path = Path(path) if path else self.DEFAULT_PATH
        self.path.parent.mkdir(parents=True, exist_ok=True)
        if not self.path.exists():
            self.path.touch()
        self._lock = threading.Lock()

    @staticmethod
    def _signature_from_error(error_text: str, status: str) -> str:
        basis = _normalise_error(error_text)
        if not basis:
            basis = status or "unknown"
        digest = hashlib.sha1(basis.encode("utf-8")).hexdigest()[:12]
        return digest

    def _append(self, record: DebugRecord) -> None:
        payload = json.dumps(record.to_dict(), ensure_ascii=False)
        with self._lock, self.path.open("a", encoding="utf-8") as fh:
            fh.write(payload + "\n")

    def record_execution_feedback(
        self,
        *,
        problem_id: Optional[int],
        description: str,
        status: str,
        error_text: str,
        guidance: str,
        source: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Persist execution feedback and return the signature used."""
        signature_core = self._signature_from_error(error_text, status)
        signature = f"exec:{signature_core}"
        record = DebugRecord(
            signature=signature,
            status=status or "unknown",
            error_text=_normalise_error(error_text) or status or "",
            guidance=(guidance or "").strip(),
            problem_id=problem_id,
            description=(description or "").strip(),
            metadata={
                "source": source,
                **(metadata or {}),
            },
            timestamp=_now_iso(),
        )
        self._append(record)
        return signature

    def record_validation_feedback(
        self,
        *,
        problem_id: Optional[int],
        issues: Iterable[str],
        metadata: Optional[Dict[str, Any]] = None,
        source: str = "validation",
    ) -> List[str]:
        """Persist validation feedback items and return the signatures used."""
        signatures: List[str] = []
        for issue in issues:
            if not issue:
                continue
            signature_core = self._signature_from_error(issue, "validation")
            signature = f"validation:{signature_core}"
            record = DebugRecord(
                signature=signature,
                status="validation",
                error_text=_normalise_error(issue),
                guidance="",
                problem_id=problem_id,
                description="",
                metadata={
                    "source": source,
                    **(metadata or {}),
                },
                timestamp=_now_iso(),
            )
            self._append(record)
            signatures.append(signature)
        return signatures

    def retrieve_for_problem(self, problem_id: int, limit: int = 3) -> List[DebugRecord]:
        """Return recent records for a given problem id (best-effort)."""
        if problem_id is None:
            return []
        matches: List[DebugRecord] = []
        with self.path.open("r", encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    payload = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if payload.get("problem_id") != problem_id:
                    continue
                matches.append(
                    DebugRecord(
                        signature=payload.get("signature", ""),
                        status=payload.get("status", ""),
                        error_text=payload.get("error_text", ""),
                        guidance=payload.get("guidance", ""),
                        problem_id=payload.get("problem_id"),
                        description=payload.get("description", ""),
                        metadata=payload.get("metadata", {}) or {},
                        timestamp=payload.get("timestamp", ""),
                    )
                )
        matches.sort(key=lambda item: item.timestamp, reverse=True)
        return matches[:limit] if limit else matches


__all__ = ["DebugMemoryStore", "DebugRecord"]
```
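A short sketch of the store's append/retrieve cycle (the path, error text, and guidance are made up for illustration):

```python
from src.debate_memory.debug_memory import DebugMemoryStore

store = DebugMemoryStore("runs/demo/debug_memory.jsonl")  # illustrative path

# Each failure is appended as one JSONL record keyed by a sha1-based signature.
sig = store.record_execution_feedback(
    problem_id=7,
    description="Production planning LP",
    status="execution_error",
    error_text="gurobipy.GurobiError: Unknown attribute 'objval'",
    guidance="Use model.objVal (capital V) after model.optimize().",
    source="manual_demo",
)
print(sig)  # e.g. "exec:<12-hex-char digest>"

# Retrieval is a linear scan over the file, returning the newest records first.
for record in store.retrieve_for_problem(7, limit=3):
    print(record.status, record.guidance)
```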
src/debate_memory/debug_memory_builder.py
ADDED
@@ -0,0 +1,150 @@
```python
"""Convert debug_memory.jsonl records into a searchable MemoryBank."""

from __future__ import annotations

import argparse
import glob
import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

from .memory_bank import MemoryBank

PROJECT_ROOT = Path(__file__).resolve().parents[2]
LEGACY_ROOT = PROJECT_ROOT.parent / "debate_with_memory"


def _default_inputs() -> List[str]:
    candidates = [
        PROJECT_ROOT / "memory_storage" / "debug_memory.jsonl",
        LEGACY_ROOT / "memory_storage" / "debug_memory.jsonl",
        PROJECT_ROOT / "memory_storage" / "backups" / "*" / "debug_memory.jsonl",
        LEGACY_ROOT / "memory_storage" / "backups" / "*" / "debug_memory.jsonl",
    ]
    return [str(path) for path in candidates]


def _stable_id(signature: str) -> int:
    digest = hashlib.sha1(signature.encode("utf-8")).hexdigest()
    return int(digest[:12], 16)


def _parse_timestamp(ts: Optional[str]) -> datetime:
    if not ts:
        return datetime.min
    try:
        parsed = datetime.fromisoformat(ts)
    except ValueError:
        return datetime.min
    if parsed.tzinfo is not None:
        # Normalise to naive UTC so records with and without timezone info
        # stay mutually comparable (aware vs naive comparisons raise TypeError).
        parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
    return parsed


def load_debug_records(input_globs: List[str]) -> Dict[str, Dict]:
    records: Dict[str, Dict] = {}
    files: List[str] = []
    for pattern in input_globs:
        files.extend(glob.glob(pattern))
    paths = sorted({Path(f) for f in files if Path(f).exists()})
    for file_path in paths:
        with file_path.open("r", encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                signature = record.get("signature")
                if not signature:
                    continue
                ts = _parse_timestamp(record.get("timestamp"))
                existing = records.get(signature)
                if existing is None or ts > existing.get("_ts", datetime.min):
                    record["_ts"] = ts
                    records[signature] = record
    return records


def build_debug_memory(records: Dict[str, Dict], output_dir: Path, clear: bool) -> None:
    if clear and output_dir.exists():
        for child in output_dir.iterdir():
            if child.is_file():
                child.unlink()
            else:
                import shutil

                shutil.rmtree(child)
    bank = MemoryBank(memory_dir=str(output_dir))
    added = 0
    for signature, record in records.items():
        description = record.get("description", "Unknown problem")
        error_text = record.get("error_text", "")
        guidance = record.get("guidance", "")
        status = record.get("status", "")
        metadata = {
            "signature": signature,
            "status": status,
            "timestamp": record.get("timestamp"),
            **(record.get("metadata") or {}),
        }
        note_lines = ["# Debug Memory Case", f"Signature: {signature}", f"Status: {status}"]
        if guidance:
            note_lines.append(f"Guidance: {guidance}")
        note_lines.append("---")
        if error_text:
            note_lines.append("Error snippet:\n" + error_text)
        note_lines.append("---")
        note_lines.append(f"Source metadata: {metadata}")
        prompt_desc = (
            f"{description}\n\n## Error Details\n```\n{error_text}\n```\n"
            f"## Guidance\n{guidance or 'N/A'}\n"
        )
        problem_id = record.get("problem_id")
        if problem_id is None:
            problem_id = _stable_id(signature)
        try:
            bank.add_case(
                problem_id=int(problem_id),
                problem_desc=prompt_desc,
                solution_code="\n".join(note_lines),
                objective_value=0.0,
                is_correct=True,
                metadata=metadata,
            )
            added += 1
        except Exception as exc:  # noqa: BLE001
            print(f"Failed to add debug case {signature}: {exc}")
    print(f"✅ Added {added} debug cases to {output_dir}")


def parse_args():
    parser = argparse.ArgumentParser(description="Build debug memory bank from debug_memory.jsonl records")
    parser.add_argument(
        "--input", nargs="*", default=_default_inputs(), help="Input files/globs containing debug records",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=str(PROJECT_ROOT / "debug_case_memory"),
        help="Where to store the constructed memory bank",
    )
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Remove existing output_dir contents before rebuilding",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    records = load_debug_records(args.input)
    print(f"Loaded {len(records)} unique debug signatures")
    build_debug_memory(records, Path(args.output_dir), clear=args.clear)


if __name__ == "__main__":
    main()
```
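The builder deduplicates by signature, keeping the newest record, and falls back to a signature-derived id when a record has no `problem_id`. A sketch of those two behaviours, with an illustrative input path and assuming `MemoryBank`'s dependencies are available:

```python
from src.debate_memory.debug_memory_builder import load_debug_records, _stable_id

# load_debug_records expands the globs, parses each JSONL line, and keeps the
# record with the latest timestamp for every signature.
records = load_debug_records(["runs/demo/debug_memory.jsonl"])  # illustrative path
print(len(records), "unique signatures")

# Records without a problem_id get a deterministic integer derived from the
# signature, so rebuilding the bank yields stable case ids across runs.
print(_stable_id("exec:deadbeef0123"))
```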
src/debate_memory/debug_utils.py
ADDED
@@ -0,0 +1,99 @@
```python
# -*- coding: utf-8 -*-
"""Minimal helpers for generated code execution reports."""

from __future__ import annotations

import json
import os
from dataclasses import dataclass, asdict
from typing import List, Optional

from .debug_memory import DebugMemoryStore


@dataclass
class DebugMetadata:
    problem_id: int
    notes: List[str]

    def to_json(self) -> str:
        return json.dumps(asdict(self), ensure_ascii=False, indent=2)


def sanitize_code(code: str, problem_id: int):
    """Ensure code ends with a newline and capture any lightweight notes."""
    metadata = DebugMetadata(problem_id=problem_id, notes=[])
    cleaned = (code or "").rstrip() + "\n" if code else ""
    return cleaned, metadata


def save_debug_metadata(metadata: DebugMetadata, output_dir: str) -> None:
    """Persist metadata only when there is something noteworthy."""
    if not metadata.notes:
        return
    debug_dir = os.path.join(output_dir, "debug")
    os.makedirs(debug_dir, exist_ok=True)
    path = os.path.join(debug_dir, f"problem_{metadata.problem_id}.json")
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(metadata.to_json())


def write_debug_suggestions(
    problem_id: int,
    description: str,
    error_message: str,
    memory_helper,
    memory_bank,
    output_dir: str,
    *,
    status: str,
    debug_store: Optional[DebugMemoryStore] = None,
    top_k_cases: int = 3,
) -> None:
    """Write a straightforward debug report and optionally record the memory."""
    _ = memory_helper, memory_bank, top_k_cases  # Unused but kept for interface compatibility.
    debug_dir = os.path.join(output_dir, "debug")
    os.makedirs(debug_dir, exist_ok=True)
    path = os.path.join(debug_dir, f"problem_{problem_id}_suggestions.md")

    lines: List[str] = [
        f"# Debug Report for Problem {problem_id}",
        "",
        f"- **Status:** {status}",
    ]
    if description:
        lines.extend(["", "## Description", description.strip(), ""])
    if error_message:
        lines.extend(
            [
                "## Error Traceback",
                "```",
                error_message.strip(),
                "```",
                "",
            ]
        )
    else:
        lines.extend(["", "## Error Traceback", "_No traceback captured._", ""])

    lines.append("## Notes")
    lines.append("")
    lines.append("Automated debugging is not yet implemented. Review the trace above for hints.")
    lines.append("")

    with open(path, "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines))

    if debug_store:
        debug_store.record_execution_feedback(
            problem_id=problem_id,
            description=description,
            status=status,
            error_text=error_message or status,
            guidance="Automated debugging is not yet implemented.",
            source="debug_utils.write_debug_suggestions",
            metadata={},
        )


__all__ = ["DebugMetadata", "sanitize_code", "save_debug_metadata", "write_debug_suggestions"]
```
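A usage sketch for `write_debug_suggestions`; `memory_helper` and `memory_bank` are accepted only for interface compatibility, so passing `None` is safe here (paths and the error message are illustrative):

```python
from src.debate_memory.debug_utils import write_debug_suggestions
from src.debate_memory.debug_memory import DebugMemoryStore

store = DebugMemoryStore("runs/demo/debug_memory.jsonl")  # illustrative path

# Writes runs/demo/debug/problem_3_suggestions.md and appends one
# execution-feedback record to the store.
write_debug_suggestions(
    problem_id=3,
    description="Blending problem",
    error_message="NameError: name 'quicksum' is not defined",
    memory_helper=None,
    memory_bank=None,
    output_dir="runs/demo",
    status="execution_error",
    debug_store=store,
)
```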
src/debate_memory/execute.py
ADDED
@@ -0,0 +1,522 @@
```python
"""
Execute and evaluate generated Gurobi code
"""

import argparse
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

from .debug_utils import sanitize_code, save_debug_metadata, write_debug_suggestions

SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent.parent
DEFAULT_MEMORY_DIR = PROJECT_ROOT / "memory_storage"
DEFAULT_GUIDELINES = DEFAULT_MEMORY_DIR / "category_guidelines.jsonl"
DEFAULT_DEBUG_MEMORY = DEFAULT_MEMORY_DIR / "debug_memory.jsonl"


def extract_objective_value(output: str) -> Optional[float]:
    """
    Extract objective value from Gurobi output

    Args:
        output: stdout from Gurobi code execution

    Returns:
        Objective value as float, or None if not found
    """
    if not output or output.strip() == "":
        return None

    # Common patterns in Gurobi output
    patterns = [
        r'Optimal\s+[Oo]bjective[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Obj:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Best\s+objective\s+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Objective\s+value:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'OBJECTIVE_VALUE:\s*([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',  # Our auto-added pattern
    ]

    for pattern in patterns:
        match = re.search(pattern, output, re.IGNORECASE)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                continue

    # Fallback: check common custom labels printed by prompts
    fallback_patterns = [
        r'Total\s+Cost[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Total\s+Profit[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Total\s+Net\s+Profit[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
        r'Total\s+Revenue[:\s]+([+-]?\d+\.?\d*(?:[eE][+-]?\d+)?)',
    ]

    for pattern in fallback_patterns:
        match = re.search(pattern, output, re.IGNORECASE)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                continue

    return None


def enhance_code_with_objective_print(code: str) -> str:
    """
    Add objective value printing to code if not already present

    This helps ensure we can extract the objective value even if
    the generated code doesn't print it explicitly.

    Note: Always adds a fallback print to handle cases where existing
    prints are conditional (e.g., inside if status == OPTIMAL blocks)
    """
    # Add code to print objective value (always add as a safety measure)
    enhancement_code = """
# Auto-added: Print objective value for evaluation (fallback)
try:
    # Try common variable names for Gurobi model
    if 'model' in dir():
        mdl = model
    elif 'm' in dir():
        mdl = m
    elif 'Model' in dir():
        mdl = Model
    else:
        mdl = None

    # Fallback: scan globals for a likely Gurobi model instance.
    # This helps when the generated code uses a non-standard variable name.
    if mdl is None:
        try:
            for _name, _val in list(globals().items()):
                try:
                    if hasattr(_val, 'objVal') and hasattr(_val, 'optimize'):
                        mdl = _val
                        break
                except Exception:
                    continue
        except Exception:
            pass

    if mdl is not None and hasattr(mdl, 'objVal'):
        try:
            obj_value = mdl.objVal
            print(f"OBJECTIVE_VALUE: {obj_value}")
        except Exception:
            # Model might not have been solved yet
            pass
except Exception:
    pass
"""

    return code + "\n" + enhancement_code


def execute_code(code: str, problem_id: int, output_dir: str, timeout: int = 60) -> Dict:
    """
    Execute Gurobi code and capture results

    Args:
        code: Python code to execute
        problem_id: Problem ID
        output_dir: Directory to save code files
        timeout: Execution timeout in seconds

    Returns:
        Dictionary with execution results
    """
    # Create output directory
    code_dir = os.path.join(output_dir, 'code')
    os.makedirs(code_dir, exist_ok=True)

    sanitized_code, debug_meta = sanitize_code(code, problem_id)
    code_enhanced = enhance_code_with_objective_print(sanitized_code)

    # Save code to file
    code_file = os.path.join(code_dir, f'problem_{problem_id}.py')
    with open(code_file, 'w', encoding='utf-8') as f:
        f.write(code_enhanced)

    # Persist debug metadata if anything noteworthy was detected
    save_debug_metadata(debug_meta, output_dir)

    # Execute code
    try:
        result = subprocess.run(
            [sys.executable, f'problem_{problem_id}.py'],
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=code_dir
        )

        stdout = result.stdout
        stderr = result.stderr
        returncode = result.returncode

        if returncode == 0:
            obj_value = extract_objective_value(stdout)
            if obj_value is not None:
                return {
                    'status': 'success',
                    'objective_value': obj_value,
                    'stdout': stdout,
                    'stderr': stderr
                }
            else:
                return {
                    'status': 'success_no_objective',
                    'objective_value': None,
                    'stdout': stdout,
                    'stderr': stderr
                }
        else:
            return {
                'status': 'execution_error',
                'objective_value': None,
                'stdout': stdout,
                'stderr': stderr,
                'returncode': returncode
            }

    except subprocess.TimeoutExpired:
        return {
            'status': 'timeout',
            'objective_value': None,
            'stdout': '',
            'stderr': f'Execution timeout after {timeout} seconds'
        }
    except Exception as e:
        return {
            'status': 'error',
            'objective_value': None,
            'stdout': '',
            'stderr': str(e)
        }


def check_correctness(pred_obj: Optional[float], gt_obj: Optional[float], tolerance: float = 0.05,
                      use_relative: bool = True) -> bool:
    """
    Check if predicted objective matches ground truth

    Args:
        pred_obj: Predicted objective value
        gt_obj: Ground truth objective value
        tolerance: Tolerance for comparison
        use_relative: Use relative tolerance if True, absolute if False

    Returns:
        True if values match within tolerance
    """
    if pred_obj is None or gt_obj is None:
        return False

    try:
        pred_obj = float(pred_obj)
        gt_obj = float(gt_obj)

        if gt_obj == 0:
            return abs(pred_obj) <= tolerance

        if use_relative:
            return abs((pred_obj - gt_obj) / gt_obj) <= tolerance
        else:
            return abs(pred_obj - gt_obj) <= tolerance
    except (ValueError, TypeError):
        return False


def evaluate_results(results: List[Dict], args) -> Dict:
    """
    Evaluate execution results

    Args:
        results: List of result dictionaries
        args: Command line arguments

    Returns:
        Evaluation report dictionary
    """
    total = len(results)
    correct = 0

    status_counts = defaultdict(int)
    correct_ids = []
    incorrect_details = []

    for result in results:
        status = result['execution_status']
        status_counts[status] += 1

        if status == 'success' and result['is_correct']:
            correct += 1
            correct_ids.append(result['id'])
        elif status == 'success' and not result['is_correct']:
            incorrect_details.append({
                'id': result['id'],
                'predicted': result['predicted_objective'],
                'ground_truth': result['ground_truth']
            })

    accuracy = correct / total if total > 0 else 0.0

    report = {
        'total_problems': total,
        'correct': correct,
        'accuracy': accuracy,
        'status_counts': dict(status_counts),
        'correct_ids': correct_ids,
        'incorrect_details': incorrect_details[:10],  # Save first 10 for reference
        'settings': {
            'tolerance': args.tolerance,
            'use_relative_tolerance': args.use_relative_tolerance,
            'timeout': args.timeout
        }
    }

    return report


def process_single_problem(gen_result, args):
    """Process a single problem (for parallel execution)"""
    problem_id = gen_result['id']
    code = gen_result['generated_code']
    gt_answer = gen_result.get('answer')

    if not code:
        result = {
            'id': problem_id,
            'execution_status': 'no_code',
            'predicted_objective': None,
            'ground_truth': gt_answer,
            'is_correct': False
        }
    else:
        exec_result = execute_code(code, problem_id, args.output_dir, args.timeout)

        pred_obj = exec_result['objective_value']
        is_correct = False

        if pred_obj is not None and gt_answer is not None:
            try:
                gt_obj = float(gt_answer)
                is_correct = check_correctness(
                    pred_obj, gt_obj,
                    args.tolerance,
                    args.use_relative_tolerance
                )
            except (ValueError, TypeError):
                is_correct = False

        result = {
            'id': problem_id,
            'execution_status': exec_result['status'],
            'predicted_objective': pred_obj,
            'ground_truth': gt_answer,
            'is_correct': is_correct,
            'stdout': exec_result['stdout'][:500] if args.save_output else '',
            'stderr': exec_result['stderr'][:500] if args.save_output else ''
        }

    return result


def main(args):
    # Load generated results
    if not os.path.exists(args.input_file):
        raise FileNotFoundError(f"Input file not found: {args.input_file}")

    with open(args.input_file, 'r', encoding='utf-8') as f:
        generated_results = [json.loads(line) for line in f if line.strip()]

    print(f"Loaded {len(generated_results)} generated results")

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)
    id_to_problem = {record['id']: record for record in generated_results}

    debug_store = None
    memory_helper = None
    memory_bank = None
    if not args.disable_debug_memory:
        try:
            from .debug_memory import DebugMemoryStore
            from .memory_bank import MemoryBank
            from .memory_intelligence import MemoryIntelligence
        except ModuleNotFoundError as exc:
            print(
                f"⚠️ Debug-memory dependencies missing ({exc}). "
                "Continuing with --disable_debug_memory behavior."
            )
            args.disable_debug_memory = True
        else:
            debug_store = DebugMemoryStore(args.debug_memory_path)
            if args.category_guidelines_path:
                try:
                    memory_helper = MemoryIntelligence(args.category_guidelines_path)
                except Exception as exc:  # noqa: BLE001
                    print(f"Warning: failed to load category guidelines ({exc})")
            if args.memory_dir:
                try:
                    if args.embedding_model:
                        memory_bank = MemoryBank(args.memory_dir, embedding_model=args.embedding_model)
                    else:
                        memory_bank = MemoryBank(args.memory_dir)
                except Exception as exc:  # noqa: BLE001
                    print(f"Warning: failed to load memory bank from {args.memory_dir} ({exc})")

    # Execute and evaluate each result
    evaluation_results = []

    if args.num_workers > 1:
        # Parallel execution
        print(f"Using {args.num_workers} workers for parallel execution")
        with ProcessPoolExecutor(max_workers=args.num_workers) as executor:
            # Submit all tasks
            future_to_problem = {
                executor.submit(process_single_problem, gen_result, args): gen_result
                for gen_result in generated_results
            }

            # Collect results with progress bar
            with tqdm(total=len(generated_results), desc="Executing") as pbar:
                for future in as_completed(future_to_problem):
                    try:
                        result = future.result()
                        evaluation_results.append(result)
                        status_symbol = '✓' if result['is_correct'] else '✗'
                        pbar.set_postfix_str(f"Problem {result['id']}: {status_symbol}")
                        pbar.update(1)
                    except Exception as e:
                        gen_result = future_to_problem[future]
                        print(f"\nError processing problem {gen_result['id']}: {e}")
                        evaluation_results.append({
                            'id': gen_result['id'],
                            'execution_status': 'error',
                            'predicted_objective': None,
                            'ground_truth': gen_result.get('answer'),
                            'is_correct': False,
                            'stdout': '',
                            'stderr': str(e)
                        })
                        pbar.update(1)

        # Sort results by ID to maintain order
        evaluation_results.sort(key=lambda x: x['id'])
    else:
        # Sequential execution (original behavior)
        for gen_result in generated_results:
            problem_id = gen_result['id']
            print(f"Processing problem {problem_id}...", end=' ')

            result = process_single_problem(gen_result, args)
            evaluation_results.append(result)

            status_symbol = '✓' if result['is_correct'] else '✗'
            print(f"{status_symbol} [{result['execution_status']}]")

    # Provide memory-aided suggestions for failures
    if not args.disable_debug_memory:
        for result in evaluation_results:
            status = result['execution_status']
            if status in ('execution_error', 'success_no_objective', 'timeout', 'no_code'):
                gen_result = id_to_problem.get(result['id'], {})
                description = gen_result.get('description', '')
                error_message = result.get('stderr') or result.get('stdout') or ''
                if not error_message:
                    if status == 'timeout':
                        error_message = 'Execution timeout'
                    elif status == 'no_code':
                        error_message = 'No code was generated for execution.'
                    elif status == 'success_no_objective':
                        error_message = 'Execution succeeded but no objective value was captured.'
                write_debug_suggestions(
                    problem_id=result['id'],
                    description=description,
                    error_message=error_message,
                    memory_helper=memory_helper,
                    memory_bank=memory_bank,
                    output_dir=args.output_dir,
                    status=status,
                    debug_store=debug_store,
                )

    # Generate evaluation report
    report = evaluate_results(evaluation_results, args)

    # Save detailed results
    results_file = os.path.join(args.output_dir, 'evaluation_results.jsonl')
    with open(results_file, 'w', encoding='utf-8') as f:
        for result in evaluation_results:
            f.write(json.dumps(result, ensure_ascii=False) + '\n')

    # Save evaluation report
    report_file = os.path.join(args.output_dir, 'evaluation_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    # Print summary
    print(f"\n{'='*60}")
    print("EVALUATION SUMMARY")
    print(f"{'='*60}")
    print(f"Total problems: {report['total_problems']}")
    print(f"Correct: {report['correct']}")
    print(f"Accuracy: {report['accuracy']:.2%}")
    print("\nStatus breakdown:")
    for status, count in sorted(report['status_counts'].items()):
        print(f"  {status:20s}: {count:3d} ({count/report['total_problems']:.1%})")
    print(f"{'='*60}")
    print("\nResults saved to:")
    print(f"  {results_file}")
    print(f"  {report_file}")


def parse_args():
    parser = argparse.ArgumentParser(description="Execute and evaluate generated Gurobi code")

    parser.add_argument("--input_file", type=str, required=True,
                        help="Path to generated results JSONL file")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="Directory to save execution results")
    parser.add_argument("--timeout", type=int, default=60,
                        help="Timeout for code execution (seconds)")
    parser.add_argument("--tolerance", type=float, default=0.05,
                        help="Tolerance for answer comparison")
    parser.add_argument("--use_relative_tolerance", action="store_true",
                        help="Use relative tolerance (default: absolute)")
    parser.add_argument("--save_output", action="store_true",
                        help="Save stdout/stderr in results")
    parser.add_argument("--num_workers", type=int, default=100,
                        help="Number of parallel workers for execution")
    parser.add_argument("--memory_dir", type=str, default=str(DEFAULT_MEMORY_DIR),
                        help="Path to episodic memory directory (used for debug suggestions)")
    parser.add_argument("--embedding_model", type=str, default=None,
                        help="Optional embedding model name or local path for debug-memory retrieval")
    parser.add_argument("--category_guidelines_path", type=str,
                        default=str(DEFAULT_GUIDELINES),
                        help="Path to category guideline JSONL file")
    parser.add_argument("--debug_memory_path", type=str,
                        default=str(DEFAULT_DEBUG_MEMORY),
                        help="Path to persistent debug memory JSONL file")
    parser.add_argument("--disable_debug_memory", action="store_true",
                        help="Disable memory-based debug suggestions")

    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    main(args)
```
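`check_correctness` treats a zero ground truth as a special case and otherwise compares the relative (or absolute) gap against the tolerance. A minimal sketch of those semantics, assuming the package imports cleanly (its module-level imports include `tqdm`):

```python
from src.debate_memory.execute import check_correctness

# Relative tolerance: |pred - gt| / |gt| <= 0.05
print(check_correctness(104.0, 100.0, tolerance=0.05, use_relative=True))  # True  (4% off)
print(check_correctness(106.0, 100.0, tolerance=0.05, use_relative=True))  # False (6% off)

# gt == 0 falls back to an absolute check regardless of use_relative.
print(check_correctness(0.03, 0.0, tolerance=0.05))                        # True
```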
src/debate_memory/generate_with_memory.py
ADDED
@@ -0,0 +1,920 @@
```python
"""
Generate with Memory: Single solution generation enhanced by memory retrieval
Based on simple_rag/generate.py + memory enhancement
"""

import argparse
import json
import os
import re
from pathlib import Path
from collections import Counter
from typing import Dict, List, Optional
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Import local utilities
from .llm import get_response
from .config import find_benchmark_path, get_prompt_template, normalize_dataset_name

# Import memory bank
from .memory_bank import MemoryBank
from .debug_memory import DebugMemoryStore
from .debug_executor import execute_generated_code, ExecutionResult

PROJECT_ROOT = Path(__file__).resolve().parents[2]
DEFAULT_MEMORY_DIR = PROJECT_ROOT / "memory_storage"
DEFAULT_DEBUG_MEMORY = DEFAULT_MEMORY_DIR / "debug_memory.jsonl"
DEFAULT_DEBUG_CASE_MEMORY = PROJECT_ROOT / "debug_case_memory"


class NoOpMemoryBank:
    """Memory-bank stub used when retrieval is explicitly disabled."""

    case_count = 0

    def retrieve_similar_cases(self, query: str, top_k: int = 0):
        return []

    def format_retrieved_cases_for_prompt(self, similar_cases):
        return ""


def load_dataset(dataset_name: str) -> List[Dict]:
    """
    Load dataset from the migrated benchmark directory layout.

    Args:
        dataset_name: Name of the dataset (e.g., "ComplexLP", "IndustryOR")

    Returns:
        List of problem dictionaries with 'description' and 'answer' fields
    """
    dataset_name = normalize_dataset_name(dataset_name)
    dataset_path = find_benchmark_path(PROJECT_ROOT, dataset_name)

    problems = []
    with dataset_path.open('r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if line.strip():
                data = json.loads(line)
                # Map en_question to description if it exists
                if 'en_question' in data and 'description' not in data:
                    data['description'] = data['en_question']
                # Map en_answer to answer if it exists
                if 'en_answer' in data and 'answer' not in data:
                    data['answer'] = data['en_answer']
                # Set id if not already present
                if 'id' not in data:
                    data['id'] = idx
                problems.append(data)

    print(f"Loaded {len(problems)} problems from {dataset_name}")
    return problems


def extract_python_code(text: str) -> str:
    """
    Extract Python code from LLM output
    Looks for code within <python>...</python> tags or ```python...``` blocks

    Args:
        text: LLM output text

    Returns:
        Extracted Python code
    """
    # Try to extract from <python>...</python> tags first
    pattern_xml = r'<python>(.*?)</python>'
    match = re.search(pattern_xml, text, re.DOTALL | re.IGNORECASE)
    if match:
        code = match.group(1).strip()
        # Remove markdown code fences if present
        code = re.sub(r'^```python\s*\n', '', code)
        code = re.sub(r'\n```\s*$', '', code)
        return code

    # Try to extract from ```python...``` blocks
    pattern_markdown = r'```python(.*?)```'
    match = re.search(pattern_markdown, text, re.DOTALL)
    if match:
        return match.group(1).strip()

    # If no code blocks found, return empty string
    return ""


def _truncate_text(text: str, limit: int = 1200) -> str:
    if isinstance(text, bytes):
        text = text.decode("utf-8", errors="replace")
    snippet = (text or "").strip()
    if not snippet:
        return ""
    if len(snippet) <= limit:
        return snippet
    return snippet[:limit] + "\n... (truncated)"


def write_debug_report(
    problem_id: int,
    description: str,
    exec_result: ExecutionResult,
    base_output_dir: str,
) -> str:
    debug_dir = os.path.join(base_output_dir, "debug")
    os.makedirs(debug_dir, exist_ok=True)
    path = os.path.join(debug_dir, f"problem_{problem_id}_debug.md")

    stdout_snippet = _truncate_text(exec_result.stdout)
    stderr_snippet = _truncate_text(exec_result.stderr)

    lines = [
        f"# Debug Report for Problem {problem_id}",
        "",
        f"- **Status:** {exec_result.status}",
    ]
    if exec_result.code_path:
        rel_path = os.path.relpath(exec_result.code_path, base_output_dir)
        lines.append(f"- **Code path:** {rel_path}")
    if description:
        lines.extend(["", "## Description", description.strip()])
    if stdout_snippet:
        lines.extend(["", "## Stdout", "```", stdout_snippet, "```"])
    if stderr_snippet:
        lines.extend(["", "## Stderr", "```", stderr_snippet, "```"])
    if not stdout_snippet and not stderr_snippet:
        lines.extend(["", "## Logs", "_No logs captured._"])

    with open(path, "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")

    return path


def filter_perfect_matches(similar_cases: List[Dict], current_description: str, max_filter: int = 1) -> List[Dict]:
    """
    Filter out cases with identical description (test set leakage)
    At most max_filter cases will be removed (default: 1)

    Args:
        similar_cases: List of retrieved cases with scores
        current_description: The description of current problem to compare against
        max_filter: Maximum number of perfect matches to filter out (default: 1)

    Returns:
        Filtered list of cases
    """
    filtered = []
    filtered_count = 0

    for case in similar_cases:
        case_desc = case['case'].get('description', '')
        problem_id = case['case'].get('problem_id', '?')
        score = case.get('score', 0.0)

        # Compare descriptions directly (exact match)
        # At most filter max_filter identical cases
        if case_desc.strip() == current_description.strip() and filtered_count < max_filter:
            filtered_count += 1
            print(f" ⚠️ Filtered: Case ID={problem_id}, similarity={score:.4f} (identical description, test set leakage)")
        else:
            filtered.append(case)

    if filtered_count > 0:
        print(f" 📊 Filtered {filtered_count} perfect match(es) (max: {max_filter}), {len(filtered)} cases remaining")

    return filtered


def refine_retrieved_cases_with_llm(
    similar_cases: List[Dict],
    current_problem_desc: str,
    model: str,
    temperature: float = 0.3
) -> str:
    """
    Use LLM to analyze ALL retrieved cases together and extract key insights

    This is a two-stage process:
    1. Retrieve similar cases (vector similarity)
    2. Use LLM to view ALL cases holistically and extract transferable insights

    Args:
        similar_cases: List of retrieved cases
        current_problem_desc: Current problem description
        model: Model name for analysis
        temperature: Temperature for analysis (slightly higher for creativity)

    Returns:
        Refined insights as a string
    """
    if not similar_cases:
        return ""

    # Build full cases content (no truncation - show everything to LLM)
    full_cases = ""
    for i, item in enumerate(similar_cases, 1):
        case = item['case']
        score = item['score']
        full_cases += f"\n{'='*70}\n"
        full_cases += f"Case {i} (Similarity Score: {score:.3f})\n"
        full_cases += f"{'='*70}\n\n"
        full_cases += f"**Problem Description:**\n{case['description']}\n\n"
        full_cases += f"**Complete Solution Code:**\n```python\n{case['solution_code']}\n```\n\n"
        full_cases += f"**Objective Value:** {case['objective_value']}\n"
        full_cases += f"**Status:** Correct ✓\n"
        full_cases += "\n"

    analysis_prompt = f"""You are an expert in optimization modeling. You will analyze multiple similar solved problems to extract **transferable insights** for a new problem.

## Current Problem to Solve:
{current_problem_desc}

## Retrieved Similar Cases (Complete):
{full_cases}

## Your Task:

Analyze ALL the cases above **holistically** and provide a structured analysis that will guide solving the current problem.

**Focus on:**

1. **Problem Type & Structure**: What category do these problems fall into? (e.g., production planning, resource allocation, scheduling, network flow)

2. **Common Modeling Patterns**:
   - What decision variables are typically used?
   - What types of constraints appear repeatedly?
   - How are objectives typically formulated?

3. **Key Techniques & Tricks**:
   - Any specific Gurobi features? (e.g., `addConstrs`, `quicksum`, binary variables, `setParam`)
   - Modeling tricks? (e.g., big-M, indicator constraints, piecewise linear)
   - Data structure patterns? (e.g., dictionaries for indices, list comprehensions)

4. **Adaptation Guidance**:
   - What aspects of the current problem are similar to the retrieved cases?
   - What's different and requires new thinking?
   - Which parts of the solution approaches can be directly adapted?

**Output Format**:
Provide a concise, actionable analysis (300-500 words) structured by the 4 points above. Be specific with code patterns and techniques, not just high-level descriptions.

**Important**: Extract **transferable knowledge**, not just summarize. Think about what the solver needs to know to adapt these solutions to the current problem."""

    try:
        analysis = get_response(analysis_prompt, model=model, temperature=temperature)
        return analysis
    except Exception as e:
        print(f" ⚠️ Warning: Failed to refine cases with LLM: {e}")
        # Fallback: return empty string, will use original formatting
        return ""


def format_debug_cases_for_prompt(cases: List[Dict]) -> str:
    if not cases:
        return ""
    lines = ["# Retrieved Debug Guidance", ""]
    for idx, item in enumerate(cases, 1):
        case = item["case"]
        score = item.get("score")
        signature = case.get("metadata", {}).get("signature", "unknown")
```
|
| 281 |
+
status = case.get("metadata", {}).get("status", "")
|
| 282 |
+
lines.append(f"## Case {idx} (similarity {score:.3f})")
|
| 283 |
+
lines.append(f"Signature: {signature} | Status: {status}")
|
| 284 |
+
description = case.get("description", "").strip()
|
| 285 |
+
if description:
|
| 286 |
+
lines.append(description if len(description) < 800 else description[:800] + "\n...")
|
| 287 |
+
lines.append("---")
|
| 288 |
+
return "\n".join(lines).strip()
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def build_error_feedback_prompt(
|
| 292 |
+
exec_result: ExecutionResult,
|
| 293 |
+
attempt_number: int,
|
| 294 |
+
previous_code: str,
|
| 295 |
+
debug_guidance: str = ""
|
| 296 |
+
) -> str:
|
| 297 |
+
"""
|
| 298 |
+
Build a prompt with error feedback for code correction
|
| 299 |
+
|
| 300 |
+
Args:
|
| 301 |
+
exec_result: Execution result with error information
|
| 302 |
+
attempt_number: Current attempt number
|
| 303 |
+
previous_code: The code that failed
|
| 304 |
+
|
| 305 |
+
Returns:
|
| 306 |
+
Feedback prompt string
|
| 307 |
+
"""
|
| 308 |
+
error_info = exec_result.stderr if exec_result.stderr else exec_result.stdout
|
| 309 |
+
if not error_info:
|
| 310 |
+
error_info = f"Status: {exec_result.status}"
|
| 311 |
+
|
| 312 |
+
feedback = f"""
|
| 313 |
+
# Code Execution Failed - Attempt {attempt_number}
|
| 314 |
+
|
| 315 |
+
Your previous code failed to execute successfully. Here is the error information:
|
| 316 |
+
|
| 317 |
+
## Error Details:
|
| 318 |
+
```
|
| 319 |
+
{error_info}
|
| 320 |
+
```
|
| 321 |
+
|
| 322 |
+
## Your Previous Code:
|
| 323 |
+
```python
|
| 324 |
+
{previous_code}
|
| 325 |
+
```
|
| 326 |
+
|
| 327 |
+
## Instructions:
|
| 328 |
+
1. Carefully analyze the error message above
|
| 329 |
+
2. Identify the root cause of the error
|
| 330 |
+
3. Fix the code to resolve the issue
|
| 331 |
+
4. Common issues to check:
|
| 332 |
+
- Variable indexing (e.g., accessing index 0 when valid indices start from 1)
|
| 333 |
+
- Missing variable definitions
|
| 334 |
+
- Incorrect constraint formulations
|
| 335 |
+
- Type mismatches
|
| 336 |
+
|
| 337 |
+
Please provide the CORRECTED code in a ```python``` code block. Make sure to:
|
| 338 |
+
- Fix the specific error mentioned above
|
| 339 |
+
- Keep the overall structure and logic intact
|
| 340 |
+
- Ensure all variables are properly defined before use
|
| 341 |
+
"""
|
| 342 |
+
if debug_guidance:
|
| 343 |
+
feedback += f"\n\n# Historical Debug Guidance\n{debug_guidance}\n"
|
| 344 |
+
return feedback
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
def generate_with_memory(
|
| 348 |
+
problem_id: int,
|
| 349 |
+
problem_desc: str,
|
| 350 |
+
memory_bank: MemoryBank,
|
| 351 |
+
model: str,
|
| 352 |
+
temperature: float,
|
| 353 |
+
top_k: int = 4,
|
| 354 |
+
filter_perfect: bool = True,
|
| 355 |
+
use_llm_refinement: bool = True,
|
| 356 |
+
*,
|
| 357 |
+
auto_debug: bool = True,
|
| 358 |
+
execution_timeout: int = 120,
|
| 359 |
+
debug_output_dir: Optional[str] = None,
|
| 360 |
+
debug_store: Optional[DebugMemoryStore] = None,
|
| 361 |
+
max_retries: int = 3,
|
| 362 |
+
debug_case_bank: Optional[MemoryBank] = None,
|
| 363 |
+
debug_case_top_k: int = 3
|
| 364 |
+
) -> Dict:
|
| 365 |
+
"""
|
| 366 |
+
Generate solution with memory enhancement
|
| 367 |
+
|
| 368 |
+
Args:
|
| 369 |
+
problem_id: Problem ID
|
| 370 |
+
problem_desc: Problem description
|
| 371 |
+
memory_bank: Memory bank instance
|
| 372 |
+
model: Model name
|
| 373 |
+
temperature: Generation temperature
|
| 374 |
+
top_k: Number of cases to retrieve (default: 4, will filter identical descriptions)
|
| 375 |
+
filter_perfect: Whether to filter out identical description matches
|
| 376 |
+
use_llm_refinement: Whether to use LLM to refine/summarize retrieved cases
|
| 377 |
+
auto_debug: Execute generated code and capture debug information
|
| 378 |
+
execution_timeout: Timeout (seconds) for executing generated code
|
| 379 |
+
debug_output_dir: Directory for storing debug artifacts (code, suggestions)
|
| 380 |
+
debug_store: Persistent store for debug experiences
|
| 381 |
+
|
| 382 |
+
Returns:
|
| 383 |
+
Dict with generation results
|
| 384 |
+
"""
|
| 385 |
+
# Retrieve similar cases from memory
|
| 386 |
+
similar_cases = memory_bank.retrieve_similar_cases(problem_desc, top_k=top_k)
|
| 387 |
+
original_retrieved = len(similar_cases)
|
| 388 |
+
|
| 389 |
+
# Filter out identical descriptions (test set leakage)
|
| 390 |
+
if filter_perfect and similar_cases:
|
| 391 |
+
similar_cases = filter_perfect_matches(similar_cases, problem_desc)
|
| 392 |
+
|
| 393 |
+
# Prepare memory context
|
| 394 |
+
memory_context = ""
|
| 395 |
+
refined_insights = ""
|
| 396 |
+
|
| 397 |
+
if similar_cases:
|
| 398 |
+
if use_llm_refinement:
|
| 399 |
+
# Use LLM to analyze and refine the retrieved cases
|
| 400 |
+
print(f" 🧠 Using LLM to refine {len(similar_cases)} retrieved cases...")
|
| 401 |
+
refined_insights = refine_retrieved_cases_with_llm(
|
| 402 |
+
similar_cases, problem_desc, model, temperature=0.3
|
| 403 |
+
)
|
| 404 |
+
|
| 405 |
+
if refined_insights:
|
| 406 |
+
memory_context = f"""# Insights from Similar Problems in Memory
|
| 407 |
+
|
| 408 |
+
Based on analysis of {len(similar_cases)} similar problems, here are key insights:
|
| 409 |
+
|
| 410 |
+
{refined_insights}
|
| 411 |
+
|
| 412 |
+
---
|
| 413 |
+
|
| 414 |
+
Please use these insights to guide your modeling approach for the current problem.
|
| 415 |
+
"""
|
| 416 |
+
else:
|
| 417 |
+
# Fallback to original formatting if refinement fails
|
| 418 |
+
memory_context = memory_bank.format_retrieved_cases_for_prompt(similar_cases)
|
| 419 |
+
else:
|
| 420 |
+
# Use original formatting (full cases)
|
| 421 |
+
memory_context = memory_bank.format_retrieved_cases_for_prompt(similar_cases)
|
| 422 |
+
|
| 423 |
+
# Build prompt with memory context
|
| 424 |
+
prompt_template = get_prompt_template("default")
|
| 425 |
+
system_prompt = prompt_template["system"]
|
| 426 |
+
user_prompt = prompt_template["user"].format(question=problem_desc)
|
| 427 |
+
|
| 428 |
+
# Inject memory context if available
|
| 429 |
+
if memory_context:
|
| 430 |
+
user_prompt = f"{memory_context}\n\n{user_prompt}"
|
| 431 |
+
|
| 432 |
+
# Generate solution with self-healing retry mechanism
|
| 433 |
+
full_prompt = f"{system_prompt}\n\n{user_prompt}"
|
| 434 |
+
|
| 435 |
+
# Calculate prompt length for monitoring
|
| 436 |
+
prompt_length = len(full_prompt)
|
| 437 |
+
prompt_tokens_estimate = prompt_length // 4 # Rough estimate: 1 token ≈ 4 chars
|
| 438 |
+
|
| 439 |
+
# Variables to track across attempts
|
| 440 |
+
attempt_history = []
|
| 441 |
+
final_response = ''
|
| 442 |
+
final_code = ''
|
| 443 |
+
execution_status = 'not_executed'
|
| 444 |
+
execution_stdout = ''
|
| 445 |
+
execution_stderr = ''
|
| 446 |
+
execution_objective = None
|
| 447 |
+
execution_returncode = None
|
| 448 |
+
suggestions_path = ''
|
| 449 |
+
executed_code_path = ''
|
| 450 |
+
debug_signature = ''
|
| 451 |
+
|
| 452 |
+
try:
|
| 453 |
+
# Self-healing loop: try up to max_retries times
|
| 454 |
+
current_prompt = full_prompt
|
| 455 |
+
|
| 456 |
+
for attempt in range(1, max_retries + 1):
|
| 457 |
+
print(f" 🔄 Attempt {attempt}/{max_retries} for problem {problem_id}")
|
| 458 |
+
|
| 459 |
+
# Generate code
|
| 460 |
+
response = get_response(current_prompt, model=model, temperature=temperature)
|
| 461 |
+
code = extract_python_code(response)
|
| 462 |
+
|
| 463 |
+
# Record this attempt
|
| 464 |
+
attempt_info = {
|
| 465 |
+
'attempt_number': attempt,
|
| 466 |
+
'response': response,
|
| 467 |
+
'code': code,
|
| 468 |
+
'execution_status': 'not_executed',
|
| 469 |
+
}
|
| 470 |
+
|
| 471 |
+
if auto_debug and code.strip():
|
| 472 |
+
target_dir = debug_output_dir or os.path.join(os.getcwd(), "auto_debug")
|
| 473 |
+
os.makedirs(target_dir, exist_ok=True)
|
| 474 |
+
|
| 475 |
+
# Execute the generated code
|
| 476 |
+
exec_result = execute_generated_code(
|
| 477 |
+
code,
|
| 478 |
+
problem_id,
|
| 479 |
+
target_dir,
|
| 480 |
+
timeout=execution_timeout,
|
| 481 |
+
)
|
| 482 |
+
|
| 483 |
+
# Update attempt info
|
| 484 |
+
attempt_info['execution_status'] = exec_result.status
|
| 485 |
+
attempt_info['objective_value'] = exec_result.objective_value
|
| 486 |
+
attempt_info['stdout'] = exec_result.stdout[:200] if exec_result.stdout else ''
|
| 487 |
+
attempt_info['stderr'] = exec_result.stderr[:200] if exec_result.stderr else ''
|
| 488 |
+
|
| 489 |
+
# Check if execution was successful
|
| 490 |
+
if exec_result.status == 'success':
|
| 491 |
+
# Success! Use this result
|
| 492 |
+
print(f" ✅ Success on attempt {attempt}")
|
| 493 |
+
execution_status = exec_result.status
|
| 494 |
+
execution_stdout = exec_result.stdout
|
| 495 |
+
execution_stderr = exec_result.stderr
|
| 496 |
+
execution_objective = exec_result.objective_value
|
| 497 |
+
execution_returncode = exec_result.returncode
|
| 498 |
+
executed_code_path = exec_result.code_path or ''
|
| 499 |
+
final_response = response
|
| 500 |
+
final_code = code
|
| 501 |
+
attempt_history.append(attempt_info)
|
| 502 |
+
break # Exit the retry loop
|
| 503 |
+
else:
|
| 504 |
+
# Failure - prepare for retry
|
| 505 |
+
print(f" ❌ Failed on attempt {attempt}: {exec_result.status}")
|
| 506 |
+
execution_status = exec_result.status
|
| 507 |
+
execution_stdout = exec_result.stdout
|
| 508 |
+
execution_stderr = exec_result.stderr
|
| 509 |
+
execution_returncode = exec_result.returncode
|
| 510 |
+
executed_code_path = exec_result.code_path or ''
|
| 511 |
+
final_response = response
|
| 512 |
+
final_code = code
|
| 513 |
+
|
| 514 |
+
# Write debug report
|
| 515 |
+
suggestions_path = write_debug_report(
|
| 516 |
+
problem_id,
|
| 517 |
+
problem_desc,
|
| 518 |
+
exec_result,
|
| 519 |
+
target_dir,
|
| 520 |
+
)
|
| 521 |
+
|
| 522 |
+
# Record to debug store
|
| 523 |
+
error_message = execution_stderr or execution_stdout or execution_status
|
| 524 |
+
if debug_store:
|
| 525 |
+
debug_signature = debug_store.record_execution_feedback(
|
| 526 |
+
problem_id=problem_id,
|
| 527 |
+
description=problem_desc,
|
| 528 |
+
status=execution_status,
|
| 529 |
+
error_text=error_message,
|
| 530 |
+
guidance=f"Attempt {attempt}/{max_retries} failed. Review the debug report.",
|
| 531 |
+
source="generate_with_memory.auto_debug.self_healing",
|
| 532 |
+
metadata={
|
| 533 |
+
"attempt": attempt,
|
| 534 |
+
"returncode": execution_returncode,
|
| 535 |
+
"code_path": executed_code_path,
|
| 536 |
+
},
|
| 537 |
+
)
|
| 538 |
+
|
| 539 |
+
attempt_history.append(attempt_info)
|
| 540 |
+
|
| 541 |
+
# If not the last attempt, prepare retry prompt
|
| 542 |
+
if attempt < max_retries:
|
| 543 |
+
guidance_text = ""
|
| 544 |
+
if debug_case_bank and error_message:
|
| 545 |
+
debug_cases = debug_case_bank.retrieve_similar_cases(
|
| 546 |
+
error_message,
|
| 547 |
+
top_k=debug_case_top_k,
|
| 548 |
+
)
|
| 549 |
+
guidance_text = format_debug_cases_for_prompt(debug_cases)
|
| 550 |
+
error_feedback = build_error_feedback_prompt(
|
| 551 |
+
exec_result,
|
| 552 |
+
attempt,
|
| 553 |
+
code,
|
| 554 |
+
debug_guidance=guidance_text,
|
| 555 |
+
)
|
| 556 |
+
# Append error feedback to the prompt for next attempt
|
| 557 |
+
current_prompt = f"{full_prompt}\n\n{error_feedback}"
|
| 558 |
+
print(f" 🔧 Preparing retry with error feedback...")
|
| 559 |
+
else:
|
| 560 |
+
print(f" ⚠️ Max retries ({max_retries}) reached, giving up")
|
| 561 |
+
|
| 562 |
+
elif not code.strip():
|
| 563 |
+
# No code generated
|
| 564 |
+
attempt_info['execution_status'] = 'no_code'
|
| 565 |
+
attempt_history.append(attempt_info)
|
| 566 |
+
execution_status = 'no_code'
|
| 567 |
+
execution_stderr = 'Generated code block is empty.'
|
| 568 |
+
final_response = response
|
| 569 |
+
final_code = code
|
| 570 |
+
|
| 571 |
+
if attempt < max_retries:
|
| 572 |
+
# Retry with feedback about missing code
|
| 573 |
+
feedback = "\n\nYour previous response did not contain any Python code. Please provide the complete Gurobi code in a ```python``` code block."
|
| 574 |
+
current_prompt = f"{full_prompt}\n\n{feedback}"
|
| 575 |
+
print(f" ⚠️ No code generated, retrying...")
|
| 576 |
+
else:
|
| 577 |
+
print(f" ⚠️ Max retries reached, no code generated")
|
| 578 |
+
break
|
| 579 |
+
|
| 580 |
+
elif not auto_debug:
|
| 581 |
+
# Auto debug disabled, just use the generated code
|
| 582 |
+
execution_status = 'skipped'
|
| 583 |
+
final_response = response
|
| 584 |
+
final_code = code
|
| 585 |
+
attempt_history.append(attempt_info)
|
| 586 |
+
break
|
| 587 |
+
|
| 588 |
+
if auto_debug:
|
| 589 |
+
if execution_status == 'success':
|
| 590 |
+
final_status = 'success'
|
| 591 |
+
elif final_code.strip():
|
| 592 |
+
final_status = 'execution_failed'
|
| 593 |
+
else:
|
| 594 |
+
final_status = 'no_code'
|
| 595 |
+
else:
|
| 596 |
+
final_status = 'success' if final_code.strip() else 'no_code'
|
| 597 |
+
|
| 598 |
+
return {
|
| 599 |
+
'id': problem_id,
|
| 600 |
+
'model': model,
|
| 601 |
+
'temperature': temperature,
|
| 602 |
+
'description': problem_desc,
|
| 603 |
+
'full_input_prompt': full_prompt, # 💾 Complete input for reproducibility
|
| 604 |
+
'refined_insights': refined_insights if use_llm_refinement else '', # LLM-refined insights
|
| 605 |
+
'prompt_length_chars': prompt_length,
|
| 606 |
+
'prompt_length_tokens_est': prompt_tokens_estimate,
|
| 607 |
+
'raw_response': final_response,
|
| 608 |
+
'generated_code': final_code,
|
| 609 |
+
'retrieved_cases': len(similar_cases),
|
| 610 |
+
'original_retrieved': original_retrieved,
|
| 611 |
+
'use_llm_refinement': use_llm_refinement,
|
| 612 |
+
'status': final_status,
|
| 613 |
+
'execution_status': execution_status,
|
| 614 |
+
'execution_stdout': execution_stdout,
|
| 615 |
+
'execution_stderr': execution_stderr,
|
| 616 |
+
'execution_objective_value': execution_objective,
|
| 617 |
+
'execution_returncode': execution_returncode,
|
| 618 |
+
'debug_suggestions_path': suggestions_path,
|
| 619 |
+
'executed_code_path': executed_code_path if executed_code_path else '',
|
| 620 |
+
'debug_signature': debug_signature,
|
| 621 |
+
'auto_debug_enabled': auto_debug,
|
| 622 |
+
'execution_timeout_sec': execution_timeout if auto_debug else None,
|
| 623 |
+
'max_retries': max_retries,
|
| 624 |
+
'total_attempts': len(attempt_history),
|
| 625 |
+
'attempt_history': attempt_history,
|
| 626 |
+
'self_healing_enabled': True,
|
| 627 |
+
}
|
| 628 |
+
|
| 629 |
+
except Exception as e:
|
| 630 |
+
print(f"Error generating solution for problem {problem_id}: {e}")
|
| 631 |
+
|
| 632 |
+
# Still save the prompt even on error
|
| 633 |
+
full_prompt = f"{system_prompt}\n\n{user_prompt}"
|
| 634 |
+
|
| 635 |
+
return {
|
| 636 |
+
'id': problem_id,
|
| 637 |
+
'model': model,
|
| 638 |
+
'temperature': temperature,
|
| 639 |
+
'description': problem_desc,
|
| 640 |
+
'full_input_prompt': full_prompt, # Save even on error
|
| 641 |
+
'refined_insights': '',
|
| 642 |
+
'prompt_length_chars': len(full_prompt),
|
| 643 |
+
'prompt_length_tokens_est': len(full_prompt) // 4,
|
| 644 |
+
'raw_response': '',
|
| 645 |
+
'generated_code': '',
|
| 646 |
+
'retrieved_cases': len(similar_cases) if similar_cases else 0,
|
| 647 |
+
'original_retrieved': original_retrieved,
|
| 648 |
+
'use_llm_refinement': use_llm_refinement,
|
| 649 |
+
'status': 'error',
|
| 650 |
+
'error': str(e),
|
| 651 |
+
'execution_status': 'not_executed',
|
| 652 |
+
'execution_stdout': '',
|
| 653 |
+
'execution_stderr': '',
|
| 654 |
+
'execution_objective_value': None,
|
| 655 |
+
'execution_returncode': None,
|
| 656 |
+
'debug_suggestions_path': '',
|
| 657 |
+
'executed_code_path': '',
|
| 658 |
+
'debug_signature': '',
|
| 659 |
+
'auto_debug_enabled': auto_debug,
|
| 660 |
+
'execution_timeout_sec': execution_timeout if auto_debug else None,
|
| 661 |
+
'max_retries': max_retries,
|
| 662 |
+
'total_attempts': 0,
|
| 663 |
+
'attempt_history': [],
|
| 664 |
+
'self_healing_enabled': True,
|
| 665 |
+
}
|
| 666 |
+
|
| 667 |
+
|
| 668 |
+
def generate_single_problem(
|
| 669 |
+
problem: Dict,
|
| 670 |
+
memory_bank: MemoryBank,
|
| 671 |
+
model: str,
|
| 672 |
+
temperature: float,
|
| 673 |
+
top_k: int,
|
| 674 |
+
filter_perfect: bool,
|
| 675 |
+
use_llm_refinement: bool,
|
| 676 |
+
*,
|
| 677 |
+
auto_debug: bool,
|
| 678 |
+
execution_timeout: int,
|
| 679 |
+
debug_output_dir: Optional[str],
|
| 680 |
+
debug_store: Optional[DebugMemoryStore],
|
| 681 |
+
max_retries: int = 3,
|
| 682 |
+
debug_case_bank: Optional[MemoryBank] = None,
|
| 683 |
+
debug_case_top_k: int = 3,
|
| 684 |
+
) -> Dict:
|
| 685 |
+
"""
|
| 686 |
+
Wrapper for parallel execution
|
| 687 |
+
"""
|
| 688 |
+
problem_id = problem['id']
|
| 689 |
+
problem_desc = problem['description']
|
| 690 |
+
|
| 691 |
+
result = generate_with_memory(
|
| 692 |
+
problem_id, problem_desc, memory_bank,
|
| 693 |
+
model, temperature, top_k, filter_perfect, use_llm_refinement,
|
| 694 |
+
auto_debug=auto_debug,
|
| 695 |
+
execution_timeout=execution_timeout,
|
| 696 |
+
debug_output_dir=debug_output_dir,
|
| 697 |
+
debug_store=debug_store,
|
| 698 |
+
max_retries=max_retries,
|
| 699 |
+
debug_case_bank=debug_case_bank,
|
| 700 |
+
debug_case_top_k=debug_case_top_k,
|
| 701 |
+
)
|
| 702 |
+
|
| 703 |
+
# Add ground truth
|
| 704 |
+
result['answer'] = problem.get('answer', '')
|
| 705 |
+
|
| 706 |
+
return result
|
| 707 |
+
|
| 708 |
+
|
| 709 |
+
def main():
|
| 710 |
+
parser = argparse.ArgumentParser(description='Generate with Memory (parallel single solutions)')
|
| 711 |
+
parser.add_argument('--dataset', type=str, default='IndustryOR',
|
| 712 |
+
help='Dataset name')
|
| 713 |
+
parser.add_argument('--model', type=str, default='gpt-4o',
|
| 714 |
+
help='Model name')
|
| 715 |
+
parser.add_argument('--temperature', type=float, default=0.01,
|
| 716 |
+
help='Temperature for generation')
|
| 717 |
+
parser.add_argument('--max_problems', type=int, default=None,
|
| 718 |
+
help='Maximum number of problems to process')
|
| 719 |
+
parser.add_argument('--output', type=str, required=True,
|
| 720 |
+
help='Output file path (JSONL)')
|
| 721 |
+
parser.add_argument('--memory_dir', type=str, default=str(DEFAULT_MEMORY_DIR),
|
| 722 |
+
help='Memory storage directory')
|
| 723 |
+
parser.add_argument('--embedding_model', type=str, default=None,
|
| 724 |
+
help='Optional embedding model name or local path for memory retrieval')
|
| 725 |
+
parser.add_argument('--memory_top_k', type=int, default=4,
|
| 726 |
+
help='Number of cases to retrieve from memory (default: 4)')
|
| 727 |
+
parser.add_argument('--no_filter_perfect', action='store_true',
|
| 728 |
+
help='Disable filtering of perfect similarity matches')
|
| 729 |
+
parser.add_argument('--use_llm_refinement', action='store_true',
|
| 730 |
+
help='Use LLM to refine/summarize retrieved cases (improves quality, costs more API calls)')
|
| 731 |
+
parser.add_argument('--parallel', type=int, default=5,
|
| 732 |
+
help='Number of parallel workers')
|
| 733 |
+
parser.add_argument('--execution_timeout', type=int, default=120,
|
| 734 |
+
help='Timeout (seconds) for executing generated code during auto-debug')
|
| 735 |
+
parser.add_argument('--no_auto_debug', action='store_true',
|
| 736 |
+
help='Disable automatic execution and debug capture for generated code')
|
| 737 |
+
parser.add_argument('--debug_output_dir', type=str, default=None,
|
| 738 |
+
help='Directory to store auto-debug artifacts (code, logs, suggestions)')
|
| 739 |
+
parser.add_argument('--debug_memory_path', type=str, default=str(DEFAULT_DEBUG_MEMORY),
|
| 740 |
+
help='Path to persistent debug memory JSONL file')
|
| 741 |
+
parser.add_argument('--debug_case_memory_dir', type=str, default=str(DEFAULT_DEBUG_CASE_MEMORY),
|
| 742 |
+
help='Directory containing consolidated debug-case memory (built via build_debug_memory.py)')
|
| 743 |
+
parser.add_argument('--debug_case_memory_top_k', type=int, default=3,
|
| 744 |
+
help='How many debug memory cases to retrieve when execution fails')
|
| 745 |
+
parser.add_argument('--max_retries', type=int, default=3,
|
| 746 |
+
help='Maximum number of retry attempts for self-healing (default: 3)')
|
| 747 |
+
|
| 748 |
+
args = parser.parse_args()
|
| 749 |
+
|
| 750 |
+
args.dataset = normalize_dataset_name(args.dataset)
|
| 751 |
+
|
| 752 |
+
auto_debug_enabled = not args.no_auto_debug
|
| 753 |
+
debug_output_dir = args.debug_output_dir
|
| 754 |
+
debug_store: Optional[DebugMemoryStore] = None
|
| 755 |
+
if auto_debug_enabled:
|
| 756 |
+
if debug_output_dir is None:
|
| 757 |
+
base_dir = os.path.dirname(args.output) or '.'
|
| 758 |
+
debug_output_dir = os.path.join(base_dir, 'auto_debug')
|
| 759 |
+
os.makedirs(debug_output_dir, exist_ok=True)
|
| 760 |
+
debug_store = DebugMemoryStore(args.debug_memory_path)
|
| 761 |
+
else:
|
| 762 |
+
debug_output_dir = None
|
| 763 |
+
|
| 764 |
+
debug_case_bank: Optional[MemoryBank] = None
|
| 765 |
+
if auto_debug_enabled and args.debug_case_memory_top_k > 0 and args.debug_case_memory_dir:
|
| 766 |
+
case_dir = Path(args.debug_case_memory_dir)
|
| 767 |
+
if case_dir.exists():
|
| 768 |
+
try:
|
| 769 |
+
if args.embedding_model:
|
| 770 |
+
debug_case_bank = MemoryBank(
|
| 771 |
+
memory_dir=str(case_dir),
|
| 772 |
+
embedding_model=args.embedding_model,
|
| 773 |
+
)
|
| 774 |
+
else:
|
| 775 |
+
debug_case_bank = MemoryBank(memory_dir=str(case_dir))
|
| 776 |
+
except Exception as exc: # noqa: BLE001
|
| 777 |
+
print(f"⚠️ Warning: failed to load debug-case memory from {case_dir} ({exc})")
|
| 778 |
+
else:
|
| 779 |
+
print(f"ℹ️ Debug-case memory directory not found: {case_dir} (skipping retrieval)")
|
| 780 |
+
|
| 781 |
+
print("="*80)
|
| 782 |
+
print("🧠 Generate with Memory (Parallel)")
|
| 783 |
+
print("="*80)
|
| 784 |
+
print(f"Dataset: {args.dataset}")
|
| 785 |
+
print(f"Model: {args.model}")
|
| 786 |
+
print(f"Temperature: {args.temperature}")
|
| 787 |
+
print(f"Memory dir: {args.memory_dir}")
|
| 788 |
+
if args.embedding_model:
|
| 789 |
+
print(f"Embedding: {args.embedding_model}")
|
| 790 |
+
print(f"Memory Top-K: {args.memory_top_k}")
|
| 791 |
+
print(f"Filter perfect matches: {not args.no_filter_perfect}")
|
| 792 |
+
print(f"LLM Refinement: {'✅ Enabled' if args.use_llm_refinement else '❌ Disabled'}")
|
| 793 |
+
print(f"Parallel: {args.parallel}")
|
| 794 |
+
print(f"Output: {args.output}")
|
| 795 |
+
print(f"Auto Debug: {'✅ Enabled' if auto_debug_enabled else '❌ Disabled'}")
|
| 796 |
+
if auto_debug_enabled:
|
| 797 |
+
print(f" Debug dir: {debug_output_dir}")
|
| 798 |
+
if args.debug_memory_path:
|
| 799 |
+
print(f" Debug memory: {args.debug_memory_path}")
|
| 800 |
+
print(f" Exec timeout: {args.execution_timeout}s")
|
| 801 |
+
print(f" Max retries: {args.max_retries} (Self-healing enabled)")
|
| 802 |
+
print("="*80)
|
| 803 |
+
print()
|
| 804 |
+
|
| 805 |
+
# Initialize memory bank only when retrieval is active.
|
| 806 |
+
if args.memory_top_k > 0:
|
| 807 |
+
print("Initializing memory bank...")
|
| 808 |
+
if args.embedding_model:
|
| 809 |
+
memory_bank = MemoryBank(memory_dir=args.memory_dir, embedding_model=args.embedding_model)
|
| 810 |
+
else:
|
| 811 |
+
memory_bank = MemoryBank(memory_dir=args.memory_dir)
|
| 812 |
+
print()
|
| 813 |
+
else:
|
| 814 |
+
print("Skipping memory bank initialization because memory_top_k=0")
|
| 815 |
+
print()
|
| 816 |
+
memory_bank = NoOpMemoryBank()
|
| 817 |
+
|
| 818 |
+
# Load dataset
|
| 819 |
+
problems = load_dataset(args.dataset)
|
| 820 |
+
if args.max_problems:
|
| 821 |
+
problems = problems[:args.max_problems]
|
| 822 |
+
|
| 823 |
+
print(f"Processing {len(problems)} problems with {args.parallel} workers")
|
| 824 |
+
print()
|
| 825 |
+
|
| 826 |
+
# Create output directory
|
| 827 |
+
os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True)
|
| 828 |
+
|
| 829 |
+
# Parallel generation
|
| 830 |
+
results = []
|
| 831 |
+
|
| 832 |
+
if args.parallel <= 1:
|
| 833 |
+
# Sequential processing
|
| 834 |
+
for problem in tqdm(problems, desc="Generating"):
|
| 835 |
+
result = generate_single_problem(
|
| 836 |
+
problem, memory_bank, args.model, args.temperature,
|
| 837 |
+
args.memory_top_k, not args.no_filter_perfect, args.use_llm_refinement,
|
| 838 |
+
auto_debug=auto_debug_enabled,
|
| 839 |
+
execution_timeout=args.execution_timeout,
|
| 840 |
+
debug_output_dir=debug_output_dir,
|
| 841 |
+
debug_store=debug_store,
|
| 842 |
+
max_retries=args.max_retries,
|
| 843 |
+
debug_case_bank=debug_case_bank,
|
| 844 |
+
debug_case_top_k=args.debug_case_memory_top_k,
|
| 845 |
+
)
|
| 846 |
+
results.append(result)
|
| 847 |
+
else:
|
| 848 |
+
# Parallel processing
|
| 849 |
+
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
|
| 850 |
+
futures = {
|
| 851 |
+
executor.submit(
|
| 852 |
+
generate_single_problem,
|
| 853 |
+
problem, memory_bank, args.model, args.temperature,
|
| 854 |
+
args.memory_top_k, not args.no_filter_perfect, args.use_llm_refinement,
|
| 855 |
+
auto_debug=auto_debug_enabled,
|
| 856 |
+
execution_timeout=args.execution_timeout,
|
| 857 |
+
debug_output_dir=debug_output_dir,
|
| 858 |
+
debug_store=debug_store,
|
| 859 |
+
max_retries=args.max_retries,
|
| 860 |
+
debug_case_bank=debug_case_bank,
|
| 861 |
+
debug_case_top_k=args.debug_case_memory_top_k,
|
| 862 |
+
): problem for problem in problems
|
| 863 |
+
}
|
| 864 |
+
|
| 865 |
+
for future in tqdm(as_completed(futures), total=len(problems), desc="Generating"):
|
| 866 |
+
try:
|
| 867 |
+
result = future.result()
|
| 868 |
+
results.append(result)
|
| 869 |
+
except Exception as e:
|
| 870 |
+
problem = futures[future]
|
| 871 |
+
print(f"Error processing problem {problem['id']}: {e}")
|
| 872 |
+
|
| 873 |
+
# Sort by problem ID
|
| 874 |
+
results.sort(key=lambda x: x['id'])
|
| 875 |
+
|
| 876 |
+
# Save results
|
| 877 |
+
with open(args.output, 'w', encoding='utf-8') as f:
|
| 878 |
+
for result in results:
|
| 879 |
+
f.write(json.dumps(result, ensure_ascii=False) + '\n')
|
| 880 |
+
|
| 881 |
+
print()
|
| 882 |
+
print("="*80)
|
| 883 |
+
print("✅ Generation Complete")
|
| 884 |
+
print("="*80)
|
| 885 |
+
print(f"Total problems: {len(results)}")
|
| 886 |
+
status_counts = Counter(r.get('status', 'unknown') for r in results)
|
| 887 |
+
print(f"Successful: {status_counts.get('success', 0)}")
|
| 888 |
+
print(f"Errors: {status_counts.get('error', 0)}")
|
| 889 |
+
print(f"Results saved to: {args.output}")
|
| 890 |
+
if status_counts:
|
| 891 |
+
print("Status breakdown:")
|
| 892 |
+
for status, count in sorted(status_counts.items()):
|
| 893 |
+
print(f" {status:<18}: {count}")
|
| 894 |
+
|
| 895 |
+
# Memory statistics
|
| 896 |
+
total_retrieved = sum(r.get('retrieved_cases', 0) for r in results)
|
| 897 |
+
total_original = sum(r.get('original_retrieved', 0) for r in results)
|
| 898 |
+
filtered = total_original - total_retrieved
|
| 899 |
+
|
| 900 |
+
# Prompt length statistics
|
| 901 |
+
prompt_lengths = [r.get('prompt_length_tokens_est', 0) for r in results if r.get('status') == 'success']
|
| 902 |
+
avg_prompt_tokens = sum(prompt_lengths) / len(prompt_lengths) if prompt_lengths else 0
|
| 903 |
+
max_prompt_tokens = max(prompt_lengths) if prompt_lengths else 0
|
| 904 |
+
|
| 905 |
+
print()
|
| 906 |
+
print("Memory Statistics:")
|
| 907 |
+
print(f" Total retrievals: {total_original}")
|
| 908 |
+
print(f" After filtering: {total_retrieved}")
|
| 909 |
+
print(f" Filtered out: {filtered} (perfect matches)")
|
| 910 |
+
print(f" Avg per problem: {total_retrieved / len(results):.2f}")
|
| 911 |
+
print()
|
| 912 |
+
print("Prompt Length Statistics:")
|
| 913 |
+
print(f" Avg prompt tokens: {avg_prompt_tokens:.0f}")
|
| 914 |
+
print(f" Max prompt tokens: {max_prompt_tokens:.0f}")
|
| 915 |
+
print(f" ℹ️ All prompts saved in 'full_input_prompt' field")
|
| 916 |
+
print("="*80)
|
| 917 |
+
|
| 918 |
+
|
| 919 |
+
if __name__ == "__main__":
|
| 920 |
+
main()
|
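Usage sketch: a minimal driver for calling `generate_with_memory` directly, mirroring what `main()` wires up. The import path (assuming `src/` is on `sys.path`), the memory directory, and the problem text are placeholders, not values shipped in this commit.

```python
# Hypothetical driver; adjust the import path to how src/ is installed.
from debate_memory.memory_bank import MemoryBank
from debate_memory.generate_with_memory import generate_with_memory

bank = MemoryBank(memory_dir="memory_storage")  # default layout assumed

result = generate_with_memory(
    problem_id=1,
    problem_desc="A factory produces two products ...",  # placeholder description
    memory_bank=bank,
    model="gpt-4o",
    temperature=0.01,
    top_k=4,
    auto_debug=True,   # execute the generated code and self-heal on failure
    max_retries=3,     # retry budget for the self-healing loop
)
print(result["status"], result["total_attempts"])
```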
src/debate_memory/llm.py
ADDED
@@ -0,0 +1,111 @@
"""
Lightweight HTTP client for OpenAI-compatible chat completions.

- Credentials are read from environment variables only.
- Supported environment variables:
  * `LLM_API_BASE_URL`
  * `LLM_API_KEY`
  * `OPENAI_BASE_URL`
  * `OPENAI_API_KEY`
  * `API_URL`
  * `API_KEY`
"""

from __future__ import annotations

import json
import os
import time
from typing import Dict, List

import requests


def _get_credentials() -> Dict[str, str]:
    api_key = (
        os.getenv("LLM_API_KEY")
        or os.getenv("OPENAI_API_KEY")
        or os.getenv("API_KEY")
    )
    base_url = (
        os.getenv("LLM_API_BASE_URL")
        or os.getenv("OPENAI_BASE_URL")
        or os.getenv("API_URL")
    )
    if not api_key:
        raise RuntimeError(
            "Missing API key. Set one of: LLM_API_KEY, OPENAI_API_KEY, API_KEY."
        )
    if not base_url:
        raise RuntimeError(
            "Missing API base URL. Set one of: "
            "LLM_API_BASE_URL, OPENAI_BASE_URL, API_URL."
        )
    return {"api_key": api_key, "base_url": base_url.rstrip("/")}


def _post_chat_completion(
    messages: List[Dict[str, str]],
    model: str,
    temperature: float,
    max_tokens: int,
) -> Dict:
    creds = _get_credentials()
    url = f"{creds['base_url']}/chat/completions"
    headers = {
        "Authorization": f"Bearer {creds['api_key']}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    try:
        return response.json()
    except json.JSONDecodeError as exc:  # pragma: no cover - defensive
        raise RuntimeError(f"Non-JSON response from LLM API: {response.text[:200]}") from exc


def _extract_content(result: Dict) -> str:
    choices = result.get("choices")
    if not choices:
        raise RuntimeError(f"LLM API response missing 'choices': {result}")
    message = choices[0].get("message") or {}
    content = message.get("content")
    if content is None:
        raise RuntimeError(f"LLM API response missing message content: {result}")
    return content


def get_response(prompt: str, model: str, temperature: float = 0.01, maximum_retries: int = 10) -> str:
    """
    Send a chat completion request using OpenAI-compatible REST calls.
    """
    if model.startswith("deepseek"):
        real_model = model.replace("-chat", "-v3").replace("-reasoner", "-r1")
    else:
        real_model = model

    attempts = max(1, maximum_retries)
    last_error: Exception | None = None
    while attempts > 0:
        try:
            result = _post_chat_completion(
                messages=[{"role": "user", "content": prompt}],
                model=real_model,
                temperature=temperature,
                max_tokens=16384,
            )
            return _extract_content(result)
        except Exception as exc:  # noqa: BLE001
            last_error = exc
            attempts -= 1
            if attempts == 0:
                break
            print(f"Error using API: {exc}. Retrying...")
            time.sleep(2)

    raise RuntimeError(f"Failed to get response from API after retries: {last_error}")
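Usage sketch: pointing the client at an OpenAI-compatible endpoint before calling `get_response`. The URL and key below are placeholders; the import path assumes `src/` is importable.

```python
# Hypothetical configuration; never commit real keys.
import os

os.environ.setdefault("LLM_API_BASE_URL", "https://api.example.com/v1")  # placeholder endpoint
os.environ.setdefault("LLM_API_KEY", "sk-...")                           # placeholder key

from debate_memory.llm import get_response

print(get_response("Reply with the single word: ok", model="gpt-4o", temperature=0.0))
```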
src/debate_memory/memory_bank.py
ADDED
@@ -0,0 +1,316 @@
"""
Memory Bank for storing and retrieving successful problem-solving cases
Uses LlamaIndex for RAG-based case retrieval
"""

import os
import json
from pathlib import Path
from typing import List, Dict, Optional
from llama_index.core import Document, VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

_PKG_DIR = Path(__file__).resolve().parent
_PROJECT_ROOT = _PKG_DIR.parent.parent
DEFAULT_MEMORY_DIR = str(_PROJECT_ROOT / "memory_storage")


class MemoryBank:
    """
    Memory Bank for storing successful problem-solving experiences

    Design inspired by Memento (https://arxiv.org/pdf/2508.16153):
    - Episodic memory: Store past successful trajectories
    - Case-based reasoning: Retrieve similar cases to guide the current problem
    - Non-parametric: No gradient updates, just memory read/write
    """

    def __init__(self, memory_dir: str = DEFAULT_MEMORY_DIR, embedding_model: str = "BAAI/bge-small-en-v1.5"):
        """
        Initialize Memory Bank

        Args:
            memory_dir: Directory to store memory index and cases
            embedding_model: HuggingFace embedding model name or local path
        """
        self.memory_dir = memory_dir
        os.makedirs(memory_dir, exist_ok=True)

        self.cases_file = os.path.join(memory_dir, "cases.jsonl")
        self.index_dir = os.path.join(memory_dir, "index")

        # Configure embedding model with local caching:
        # - cache_folder points at llama_index's cache directory
        # - trust_remote_code is False for security
        # - if embedding_model is a local path, use it directly;
        #   otherwise try the cached model to avoid network requests
        os.environ.setdefault("HF_HUB_OFFLINE", "0")  # Allow online access by default

        # Check whether embedding_model is a local file path
        is_local_path = os.path.isabs(embedding_model) or (os.path.sep in embedding_model and os.path.exists(embedding_model))

        try:
            # If it's a local path, use it directly
            if is_local_path:
                print(f"📁 Using local embedding model from: {embedding_model}")
                Settings.embed_model = HuggingFaceEmbedding(
                    model_name=embedding_model,
                    cache_folder=os.path.expanduser("~/.cache/llama_index"),
                    trust_remote_code=False
                )
            else:
                # Try to load from cache first to avoid network requests;
                # set HF_HUB_OFFLINE=1 to force local-only mode
                print(f"🔍 Loading embedding model: {embedding_model}")
                print("   (If you want to avoid Hugging Face downloads, set HF_HUB_OFFLINE=1 or use a local model path)")
                Settings.embed_model = HuggingFaceEmbedding(
                    model_name=embedding_model,
                    cache_folder=os.path.expanduser("~/.cache/llama_index"),
                    trust_remote_code=False
                )
        except Exception as e:
            # If model loading fails, try to use the cached model only
            print(f"⚠️ Warning: Failed to load embedding model '{embedding_model}': {e}")
            print("   Attempting to use cached model only (setting HF_HUB_OFFLINE=1)...")
            os.environ["HF_HUB_OFFLINE"] = "1"
            try:
                Settings.embed_model = HuggingFaceEmbedding(
                    model_name=embedding_model,
                    cache_folder=os.path.expanduser("~/.cache/llama_index"),
                    trust_remote_code=False
                )
                print("   ✅ Using cached model")
            except Exception as e2:
                print(f"❌ Error: Could not load embedding model: {e2}")
                print("   Please either:")
                print("   1. Download the model first: python -c \"from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-small-en-v1.5')\"")
                print("   2. Set HF_HUB_OFFLINE=1 and ensure the model is cached")
                print("   3. Use a local model path: --embedding_model /path/to/local/model")
                raise
        # Disable chunking to ensure one document = one node (no duplicates)
        Settings.chunk_size = 8192  # Large enough to never split
        Settings.chunk_overlap = 0

        # Load or create index
        self.index = self._load_or_create_index()
        self.case_count = self._count_cases()

        print(f"Memory Bank initialized with {self.case_count} cases")

    def _load_or_create_index(self):
        """Load the existing index or create a new one"""
        if os.path.exists(self.index_dir):
            try:
                storage_context = StorageContext.from_defaults(persist_dir=self.index_dir)
                index = load_index_from_storage(storage_context)
                print(f"Loaded existing memory index from {self.index_dir}")
                return index
            except Exception:  # avoid a bare except; fall through to rebuilding
                print("Failed to load index, creating new one")

        # Create a new empty index
        documents = []
        index = VectorStoreIndex.from_documents(documents)
        os.makedirs(self.index_dir, exist_ok=True)
        index.storage_context.persist(persist_dir=self.index_dir)
        print(f"Created new memory index at {self.index_dir}")
        return index

    def _count_cases(self) -> int:
        """Count the number of cases in memory"""
        if not os.path.exists(self.cases_file):
            return 0
        with open(self.cases_file, 'r') as f:
            return sum(1 for _ in f)

    def add_case(self, problem_id: int, problem_desc: str, solution_code: str,
                 objective_value: float, is_correct: bool, metadata: Optional[Dict] = None):
        """
        Add a successful case to memory

        Args:
            problem_id: Problem ID
            problem_desc: Problem description
            solution_code: Solution code
            objective_value: Computed objective value
            is_correct: Whether the solution is correct
            metadata: Additional metadata (model, debate_rounds, etc.)
        """
        if not is_correct:
            # Only store successful cases
            return

        case = {
            'problem_id': problem_id,
            'description': problem_desc,
            'solution_code': solution_code,
            'objective_value': objective_value,
            'is_correct': is_correct,
            'metadata': metadata or {}
        }

        # Write to the cases file
        with open(self.cases_file, 'a', encoding='utf-8') as f:
            f.write(json.dumps(case, ensure_ascii=False) + '\n')

        # Create a document for indexing;
        # combine description and key solution insights for better retrieval
        doc_text = f"""Problem: {problem_desc}

Solution approach:
{solution_code[:500]}...

Key features:
- Problem ID: {problem_id}
- Objective value: {objective_value}
- Status: Correct
"""

        doc = Document(
            text=doc_text,
            metadata={
                'problem_id': problem_id,
                'objective_value': objective_value,
                **case['metadata']
            }
        )

        # Add to the index
        self.index.insert(doc)
        self.index.storage_context.persist(persist_dir=self.index_dir)

        self.case_count += 1
        print(f"✅ Added case {problem_id} to memory (Total: {self.case_count})")

    def retrieve_similar_cases(self, query: str, top_k: int = 3, preferred_dataset: Optional[str] = None) -> List[Dict]:
        """
        Retrieve similar cases from memory using RAG based on semantic similarity

        Args:
            query: Query text (usually the problem description)
            top_k: Number of similar cases to retrieve (0 = no retrieval)
            preferred_dataset: Preferred dataset name to prioritize (optional)

        Returns:
            List of similar cases with scores, sorted by semantic similarity
        """
        if self.case_count == 0 or top_k <= 0:
            return []

        # Query the index - purely based on semantic similarity
        retriever = self.index.as_retriever(similarity_top_k=top_k * 2 if preferred_dataset else top_k)
        nodes = retriever.retrieve(query)

        # Load the corresponding cases from cases.jsonl based on semantic similarity
        similar_cases = []
        seen_keys = set()  # Track which (problem_id, dataset) combinations we've added

        # If preferred_dataset is specified, prioritize those cases
        preferred_cases = []
        other_cases = []

        for node in nodes:
            problem_id = node.metadata.get('problem_id')
            score = node.score
            node_dataset = node.metadata.get('dataset', '')

            # Build a key for deduplication
            case_key = (problem_id, node_dataset)
            if case_key in seen_keys:
                continue

            # Load the case - use the dataset from node metadata to get the exact match
            case_data = None
            if node_dataset:
                # Try to load by problem_id and dataset (more precise)
                case_data = self._load_case_by_id_and_dataset(problem_id, node_dataset)

            if not case_data:
                # Fallback: try to load by problem_id only
                case_data = self._load_case_by_id(problem_id)

            if case_data:
                seen_keys.add(case_key)
                case_item = {
                    'case': case_data,
                    'score': score,
                    'text_preview': node.text[:200]
                }

                # Separate preferred-dataset cases from the others
                if preferred_dataset and node_dataset == preferred_dataset:
                    preferred_cases.append(case_item)
                else:
                    other_cases.append(case_item)

        # Combine: preferred cases first, then others, all sorted by similarity score
        similar_cases = preferred_cases + other_cases

        # Return the top_k results
        return similar_cases[:top_k]

    def _load_case_by_id(self, problem_id: int) -> Optional[Dict]:
        """Load a specific case by problem ID (returns the first match)"""
        if not os.path.exists(self.cases_file):
            return None

        with open(self.cases_file, 'r', encoding='utf-8') as f:
            for line in f:
                case = json.loads(line)
                if case['problem_id'] == problem_id:
                    return case
        return None

    def _load_case_by_id_and_dataset(self, problem_id: int, dataset: str) -> Optional[Dict]:
        """Load a specific case by problem ID and dataset"""
        if not os.path.exists(self.cases_file):
            return None

        with open(self.cases_file, 'r', encoding='utf-8') as f:
            for line in f:
                case = json.loads(line)
                if case['problem_id'] == problem_id:
                    case_dataset = case.get('metadata', {}).get('dataset', '')
                    if case_dataset == dataset:
                        return case
        return None

    def get_memory_stats(self) -> Dict:
        """Get memory bank statistics"""
        return {
            'total_cases': self.case_count,
            'memory_dir': self.memory_dir,
            'cases_file': self.cases_file,
            'index_dir': self.index_dir
        }

    def format_retrieved_cases_for_prompt(self, cases: List[Dict]) -> str:
        """
        Format retrieved cases for inclusion in an LLM prompt

        Args:
            cases: List of retrieved cases

        Returns:
            Formatted string for the prompt
        """
        if not cases:
            return ""

        prompt = "# Retrieved Similar Cases from Memory\n\n"
        prompt += "The following successful cases from previous problems might be relevant:\n\n"

        for i, item in enumerate(cases, 1):
            case = item['case']
            score = item['score']

            prompt += f"## Case {i} (Similarity: {score:.3f})\n"
            prompt += f"**Problem:** {case['description']}\n\n"
            prompt += f"**Solution approach:**\n```python\n{case['solution_code']}\n```\n\n"
            prompt += f"**Result:** Objective value = {case['objective_value']}, Status = Correct\n\n"
            prompt += "---\n\n"

        return prompt
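Usage sketch: the MemoryBank write/read cycle under the default layout. The problem text, solution code, and dataset tag are placeholders; the import path assumes `src/` is importable.

```python
# Hypothetical round-trip through the memory bank.
from debate_memory.memory_bank import MemoryBank

bank = MemoryBank(memory_dir="memory_storage")

# Write: only correct solutions are persisted (is_correct gates the insert).
bank.add_case(
    problem_id=42,
    problem_desc="Maximize profit subject to capacity limits ...",  # placeholder
    solution_code="import gurobipy as gp\n...",                     # placeholder
    objective_value=1234.0,
    is_correct=True,
    metadata={"dataset": "IndustryOR"},
)

# Read: returns [{'case': ..., 'score': ..., 'text_preview': ...}, ...]
hits = bank.retrieve_similar_cases("capacity-constrained production planning", top_k=3)
print(bank.format_retrieved_cases_for_prompt(hits))
```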
src/debate_memory/memory_intelligence.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Lightweight helpers for categorising optimisation problems and surfacing
|
| 4 |
+
category-level memory.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import re
|
| 12 |
+
from collections import defaultdict
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Dict, Iterable, List, Set, Tuple
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
_PKG_DIR = Path(__file__).resolve().parent
|
| 18 |
+
_PROJECT_ROOT = _PKG_DIR.parent.parent
|
| 19 |
+
DEFAULT_GUIDELINE_PATH = str(_PROJECT_ROOT / "memory_storage" / "category_guidelines.jsonl")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class MemoryIntelligence:
|
| 23 |
+
"""
|
| 24 |
+
Heuristic problem classifier + guideline loader.
|
| 25 |
+
|
| 26 |
+
The goal is to offer fast, rule-based categorisation that can run
|
| 27 |
+
offline. If the heuristics fail, downstream agents (LLMs) can still
|
| 28 |
+
append tags, but we always return the heuristic view for consistency.
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
CATEGORY_KEYWORDS: Dict[str, Set[str]] = {
|
| 32 |
+
"workforce_planning": {
|
| 33 |
+
"worker",
|
| 34 |
+
"workforce",
|
| 35 |
+
"training",
|
| 36 |
+
"trainee",
|
| 37 |
+
"overtime",
|
| 38 |
+
"hire",
|
| 39 |
+
"fire",
|
| 40 |
+
},
|
| 41 |
+
"inventory_planning": {
|
| 42 |
+
"inventory",
|
| 43 |
+
"backlog",
|
| 44 |
+
"stock",
|
| 45 |
+
"warehouse",
|
| 46 |
+
"storage",
|
| 47 |
+
"holding cost",
|
| 48 |
+
},
|
| 49 |
+
"production_planning": {
|
| 50 |
+
"production",
|
| 51 |
+
"factory",
|
| 52 |
+
"capacity",
|
| 53 |
+
"machine",
|
| 54 |
+
"batch",
|
| 55 |
+
"demand",
|
| 56 |
+
},
|
| 57 |
+
"scheduling": {
|
| 58 |
+
"schedule",
|
| 59 |
+
"sequencing",
|
| 60 |
+
"precedence",
|
| 61 |
+
"flow shop",
|
| 62 |
+
"job shop",
|
| 63 |
+
"makespan",
|
| 64 |
+
},
|
| 65 |
+
"transportation": {
|
| 66 |
+
"transport",
|
| 67 |
+
"shipping",
|
| 68 |
+
"vehicle",
|
| 69 |
+
"route",
|
| 70 |
+
"delivery",
|
| 71 |
+
"supply",
|
| 72 |
+
"demand",
|
| 73 |
+
"shipment",
|
| 74 |
+
},
|
| 75 |
+
"network_flow": {
|
| 76 |
+
"flow",
|
| 77 |
+
"arc",
|
| 78 |
+
"network",
|
| 79 |
+
"node",
|
| 80 |
+
"capacity",
|
| 81 |
+
"supply node",
|
| 82 |
+
"demand node",
|
| 83 |
+
},
|
| 84 |
+
"assignment": {
|
| 85 |
+
"assignment",
|
| 86 |
+
"allocate",
|
| 87 |
+
"task",
|
| 88 |
+
"agent",
|
| 89 |
+
"matching",
|
| 90 |
+
"job",
|
| 91 |
+
},
|
| 92 |
+
"facility_location": {
|
| 93 |
+
"facility",
|
| 94 |
+
"location",
|
| 95 |
+
"plant",
|
| 96 |
+
"open",
|
| 97 |
+
"siting",
|
| 98 |
+
"distribution center",
|
| 99 |
+
},
|
| 100 |
+
"traveling_salesman": {
|
| 101 |
+
"tsp",
|
| 102 |
+
"tour",
|
| 103 |
+
"city",
|
| 104 |
+
"travel",
|
| 105 |
+
"route visiting",
|
| 106 |
+
"cyclic",
|
| 107 |
+
},
|
| 108 |
+
"portfolio_optimization": {
|
| 109 |
+
"portfolio",
|
| 110 |
+
"investment",
|
| 111 |
+
"asset",
|
| 112 |
+
"return",
|
| 113 |
+
"risk",
|
| 114 |
+
"variance",
|
| 115 |
+
},
|
| 116 |
+
}
|
| 117 |
+
|
    def __init__(self, guideline_path: str = DEFAULT_GUIDELINE_PATH):
        self.guideline_path = guideline_path
        self.guidelines = self._load_guidelines(guideline_path)

    @staticmethod
    def _load_guidelines(path: str) -> Dict[str, Dict]:
        guidelines: Dict[str, Dict] = {}
        if not path or not os.path.exists(path):
            return guidelines
        with open(path, "r", encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    payload = json.loads(line)
                except json.JSONDecodeError:
                    continue
                category = payload.get("category")
                if not category:
                    continue
                guidelines[category] = payload
        return guidelines

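    # Guideline file format consumed by _load_guidelines above: one JSON object
    # per line, keyed by "category". A hypothetical entry:
    #   {"category": "scheduling", "title": "Scheduling",
    #    "guidelines": ["Model makespan as an explicit decision variable"]}
    # Blank lines, malformed JSON, and entries without "category" are skipped.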
    def classify(self, description: str, top_k: int = 3, minimum_score: int = 1) -> List[Tuple[str, int]]:
        """
        Return a ranked list of (category, score) using keyword heuristics.
        """
        if not description:
            return []
        text = description.lower()
        scores: Dict[str, int] = defaultdict(int)
        for category, keywords in self.CATEGORY_KEYWORDS.items():
            for keyword in keywords:
                occurrences = len(re.findall(r"\b" + re.escape(keyword.lower()) + r"\b", text))
                if occurrences:
                    scores[category] += occurrences
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        filtered = [(cat, score) for cat, score in ranked if score >= minimum_score]
        if top_k:
            return filtered[:top_k]
        return filtered

    def categories_only(self, description: str, top_k: int = 3, minimum_score: int = 1) -> List[str]:
        return [cat for cat, _ in self.classify(description, top_k=top_k, minimum_score=minimum_score)]

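    # A minimal usage sketch of classify() above (the sample text is made up):
    #   MemoryIntelligence().classify("Schedule the jobs to minimize makespan")
    #   -> [("scheduling", 2)]
    # "schedule" and "makespan" each score one hit; "jobs" does not count
    # because the \b-anchored regex only matches the whole token "job".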
    def guideline_text(
        self,
        categories: Iterable[str],
        include_header: bool = True,
        max_items_per_category: int = 4,
    ) -> str:
        """
        Render guidelines for the provided categories as a markdown string.
        """
        categories = list(dict.fromkeys(categories))  # deduplicate while preserving order
        if not categories:
            return ""

        lines: List[str] = []
        if include_header:
            lines.append("# Category Playbook")
            lines.append("")

        for category in categories:
            entry = self.guidelines.get(category)
            if not entry:
                continue
            title = entry.get("title") or category.replace("_", " ").title()
            lines.append(f"## {title}")
            guidelines = entry.get("guidelines") or []
            if not guidelines:
                continue
            for bullet in guidelines[:max_items_per_category]:
                lines.append(f"- {bullet}")
            lines.append("")

        return "\n".join(lines).strip()

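    # guideline_text(["scheduling"]) above would render, given the hypothetical
    # JSONL entry sketched earlier:
    #   # Category Playbook
    #
    #   ## Scheduling
    #   - Model makespan as an explicit decision variable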
    def guideline_bullets(self, categories: Iterable[str], max_items_per_category: int = 4) -> List[str]:
        bullets: List[str] = []
        for category in categories:
            entry = self.guidelines.get(category)
            if not entry:
                continue
            title = entry.get("title") or category.replace("_", " ").title()
            guidelines = entry.get("guidelines") or []
            for item in guidelines[:max_items_per_category]:
                bullets.append(f"{title}: {item}")
        return bullets


__all__ = ["MemoryIntelligence", "DEFAULT_GUIDELINE_PATH"]

src/debate_memory/run_memory_debate.py
ADDED
@@ -0,0 +1,580 @@
#!/usr/bin/env python3
"""
Run debates between two models using memory-augmented single generations.

This script automatically locates the latest initial-solution files for the
specified models, runs the parallel debate workflow from `simple_rag/debate.py`,
and then evaluates the consensus solutions with `execute.py`.

Example:
    python run_memory_debate.py \
        --datasets ComplexLP EasyLP \
        --max_rounds 3 \
        --debate_workers 16 \
        --execute_workers 128
"""
from __future__ import annotations

import argparse
import datetime as dt
import glob
import json
import os
import re
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from .config import normalize_dataset_name
from .memory_bank import MemoryBank

SCRIPT_DIR = Path(__file__).resolve().parent
SRC_DIR = SCRIPT_DIR.parent
PROJECT_ROOT = SRC_DIR.parent
MONOREPO_ROOT = PROJECT_ROOT.parent
STANDARD_RESULTS_ROOT = PROJECT_ROOT.parent.parent / "results" / "Agora-Opt"
DEFAULT_RESULTS_DIR = STANDARD_RESULTS_ROOT / "generation"
DEFAULT_OUTPUT_ROOT = STANDARD_RESULTS_ROOT / "debate"
DEFAULT_DEBATE_SCRIPT = MONOREPO_ROOT / "simple_rag" / "debate.py"
DEFAULT_EXECUTE_SCRIPT = PROJECT_ROOT / "scripts" / "execute.py"
DEFAULT_DEBATE_MEMORY_DIR = PROJECT_ROOT / "debate_memory_storage"
DEBATE_MEMORY_HEADER = "# Debate Memory Insights"


def format_debate_memory_context(cases: List[Dict]) -> str:
    if not cases:
        return ""
    lines = [DEBATE_MEMORY_HEADER, ""]
    for idx, item in enumerate(cases, 1):
        case = item["case"]
        score = item.get("score", 0.0)
        metadata = case.get("metadata", {})
        dataset = metadata.get("dataset", "unknown")
        summary = metadata.get("summary", {}).get("summary")
        lines.append(f"## Case {idx} (similarity {score:.3f}, dataset {dataset})")
        description = case.get("description", "").strip()
        if description:
            snippet = description if len(description) <= 800 else description[:800] + "\n..."
            lines.append(snippet)
        if summary:
            lines.append("Summary: " + summary)
        lines.append("---")
    return "\n".join(lines).strip()


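# The context block produced by format_debate_memory_context above looks like
# (all values illustrative):
#   # Debate Memory Insights
#
#   ## Case 1 (similarity 0.873, dataset ComplexLP)
#   <first 800 characters of the retrieved problem description>
#   Summary: <stored summary, when present>
#   ---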
def build_debate_memory_contexts(
    files: List[str],
    debate_memory: Optional[MemoryBank],
    dataset: str,
    top_k: int,
) -> Dict[int, str]:
    contexts: Dict[int, str] = {}
    if debate_memory is None or top_k <= 0:
        return contexts
    for file_path in files:
        with open(file_path, "r", encoding="utf-8") as fh:
            for line in fh:
                if not line.strip():
                    continue
                data = json.loads(line)
                problem_id = data.get("id")
                if problem_id is None or problem_id in contexts:
                    continue
                description = data.get("description", "")
                if not description.strip():
                    contexts[problem_id] = ""
                    continue
                cases = debate_memory.retrieve_similar_cases(
                    description,
                    top_k=top_k,
                    preferred_dataset=dataset,
                )
                contexts[problem_id] = format_debate_memory_context(cases)
    return contexts


def maybe_enrich_generation_file(
    source_path: str,
    destination_path: str,
    contexts: Dict[int, str],
) -> str:
    if not contexts:
        return source_path
    changed = False
    enriched_lines: List[str] = []
    with open(source_path, "r", encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            data = json.loads(line)
            pid = data.get("id")
            context = contexts.get(pid)
            if context:
                data["description"] = f"{data.get('description', '').strip()}\n\n{context}"
                changed = True
            enriched_lines.append(json.dumps(data, ensure_ascii=False))
    if not changed:
        return source_path
    with open(destination_path, "w", encoding="utf-8") as fh:
        for entry in enriched_lines:
            fh.write(entry + "\n")
    return destination_path


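# Effect of maybe_enrich_generation_file on one JSONL record (values made up):
#   before: {"id": 7, "description": "Minimize total shipping cost ..."}
#   after:  {"id": 7, "description": "Minimize total shipping cost ...\n\n# Debate Memory Insights\n..."}
# When no record changes, the original source_path is returned and nothing is written.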
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Parallel debate runner for memory-enhanced single generations"
    )
    parser.add_argument(
        "--modelA",
        type=str,
        default="gpt-4o",
        help="First model in the debate (default: gpt-4o)",
    )
    parser.add_argument(
        "--modelB",
        type=str,
        default="deepseek-chat",
        help="Second model in the debate (default: deepseek-chat)",
    )
    parser.add_argument(
        "--results_dir",
        type=str,
        default=str(DEFAULT_RESULTS_DIR),
        help="Directory that stores initial-solution JSONL files",
    )
    parser.add_argument(
        "--datasets",
        nargs="*",
        default=None,
        help="Datasets to debate. If omitted, auto-detect common datasets.",
    )
    parser.add_argument(
        "--output_root",
        type=str,
        default=str(DEFAULT_OUTPUT_ROOT),
        help="Root directory to store debate/eval outputs",
    )
    parser.add_argument(
        "--debate_script",
        type=str,
        default=str(DEFAULT_DEBATE_SCRIPT),
        help="Path to simple_rag/debate.py (override if needed)",
    )
    parser.add_argument(
        "--execute_script",
        type=str,
        default=str(DEFAULT_EXECUTE_SCRIPT),
        help="Path to debate_with_memory/execute.py (override if needed)",
    )
    parser.add_argument(
        "--max_rounds",
        type=int,
        default=3,
        help="Maximum number of debate rounds (default: 3)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.01,
        help="Temperature for debate LLM calls (default: 0.01)",
    )
    parser.add_argument(
        "--debate_workers",
        type=int,
        default=16,
        help="Parallel workers for debate (ThreadPool inside debate.py)",
    )
    parser.add_argument(
        "--execute_workers",
        type=int,
        default=128,
        help="Parallel workers for execute.py evaluation",
    )
    parser.add_argument(
        "--max_problems",
        type=int,
        default=None,
        help="Optional cap on number of problems per dataset",
    )
    parser.add_argument(
        "--tolerance",
        type=float,
        default=0.05,
        help="Relative tolerance for evaluation accuracy comparison",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=90,
        help="Timeout (seconds) for executing consensus code",
    )
    parser.add_argument(
        "--relative_tolerance",
        action="store_true",
        help="Pass --use_relative_tolerance to execute.py",
    )
    parser.add_argument(
        "--save_execution_stdout",
        action="store_true",
        help="Store stdout/stderr for consensus executions",
    )
    parser.add_argument(
        "--execute_memory_dir",
        type=str,
        default=None,
        help="Optional memory_storage directory forwarded to execute.py during consensus evaluation.",
    )
    parser.add_argument(
        "--execute_debug_memory_path",
        type=str,
        default=None,
        help="Optional debug_memory.jsonl path forwarded to execute.py during consensus evaluation.",
    )
    parser.add_argument(
        "--execute_disable_debug_memory",
        action="store_true",
        help="Pass --disable_debug_memory to execute.py during consensus evaluation.",
    )
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Only print the planned actions without running debate/eval",
    )
    parser.add_argument(
        "--debate_memory_dir",
        type=str,
        default=str(DEFAULT_DEBATE_MEMORY_DIR),
        help="Directory containing debate memory cases for prompt augmentation",
    )
    parser.add_argument(
        "--debate_memory_top_k",
        type=int,
        default=2,
        help="How many debate memory cases to retrieve per problem",
    )
    parser.add_argument(
        "--disable_debate_memory",
        action="store_true",
        help="Skip retrieval even if debate memory directory exists",
    )
    parser.add_argument(
        "--embedding_model",
        type=str,
        default=None,
        help="Embedding model name or local path (default: BAAI/bge-small-en-v1.5). "
        "Use a local path to avoid Hugging Face downloads, or set the HF_HUB_OFFLINE=1 environment variable.",
    )
    return parser.parse_args()


def normalize_dataset_list(raw_list: Optional[List[str]]) -> Optional[List[str]]:
    """Split comma-separated values and strip whitespace."""
    if not raw_list:
        return None
    datasets: List[str] = []
    for item in raw_list:
        parts = [part.strip() for part in item.split(",") if part.strip()]
        datasets.extend(normalize_dataset_name(part) for part in parts)
    return datasets or None


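# Example (assuming normalize_dataset_name leaves these names unchanged):
#   normalize_dataset_list(["ComplexLP, EasyLP"]) -> ["ComplexLP", "EasyLP"]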
def collect_runs(results_dir: str, model: str) -> Dict[str, List[Tuple[str, str]]]:
    """
    Return mapping dataset -> list of (timestamp, path) sorted ascending.
    Skips evaluation artifacts (suffixes containing '_eval').
    """
    pattern = os.path.join(results_dir, f"{model}_*.jsonl")
    regex = re.compile(rf"{re.escape(model)}_(.+)_(\d{{8}}_\d{{6}})\.jsonl$")
    runs: Dict[str, List[Tuple[str, str]]] = {}

    for path in glob.glob(pattern):
        base = os.path.basename(path)
        match = regex.match(base)
        if not match:
            continue
        dataset = normalize_dataset_name(match.group(1))
        if "_eval" in dataset:
            continue
        timestamp = match.group(2)
        runs.setdefault(dataset, []).append((timestamp, path))

    for dataset in runs:
        runs[dataset].sort()  # chronological

    return runs


def pick_latest(runs: Dict[str, List[Tuple[str, str]]], dataset: str) -> Optional[str]:
    """Return latest file path for dataset if available."""
    entries = runs.get(dataset)
    if not entries:
        return None
    return entries[-1][1]


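# Filename convention expected by collect_runs (example name is illustrative):
#   gpt-4o_ComplexLP_20240101_120000.jsonl
#   -> dataset "ComplexLP" (after normalize_dataset_name), timestamp "20240101_120000"
# Lexicographic sort of zero-padded YYYYMMDD_HHMMSS timestamps is chronological,
# which is why pick_latest can simply take entries[-1].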
def stream_command(cmd: List[str], cwd: str, log_path: str) -> None:
    """Run a subprocess, streaming output to stdout and a log file."""
    print(f"\n▶ Running: {' '.join(cmd)}", flush=True)
    print(f"  cwd: {cwd}", flush=True)
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    with open(log_path, "w", encoding="utf-8") as log_file:
        process = subprocess.Popen(
            cmd,
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            encoding="utf-8",
            errors="replace",
            bufsize=1,
        )
        assert process.stdout is not None  # for type checkers
        for line in process.stdout:
            print(line, end="", flush=True)
            log_file.write(line)
            log_file.flush()
        return_code = process.wait()

    if return_code != 0:
        raise subprocess.CalledProcessError(return_code, cmd)


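# In stream_command above, bufsize=1 requests line buffering (valid because
# text=True), so each line the child writes is echoed to the console and the
# log without waiting for the process to finish.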
def load_eval_report(report_path: str) -> Optional[Dict]:
    if not os.path.exists(report_path):
        return None
    with open(report_path, "r", encoding="utf-8") as fh:
        return json.load(fh)


def ensure_script(path: str, description: str) -> None:
    if not os.path.isfile(path):
        raise FileNotFoundError(f"{description} not found: {path}")


def main() -> None:
    args = parse_args()
    args.datasets = normalize_dataset_list(args.datasets)
    args.output_root = os.path.abspath(args.output_root)
    args.results_dir = os.path.abspath(args.results_dir)

    debate_memory_bank: Optional[MemoryBank] = None
    if not args.disable_debate_memory and args.debate_memory_dir:
        debate_memory_path = Path(args.debate_memory_dir)
        if debate_memory_path.exists():
            try:
                embedding_model = args.embedding_model if args.embedding_model else "BAAI/bge-small-en-v1.5"
                debate_memory_bank = MemoryBank(
                    memory_dir=str(debate_memory_path),
                    embedding_model=embedding_model,
                )
            except Exception as exc:  # noqa: BLE001
                print(f"⚠️ Warning: failed to load debate memory from {debate_memory_path}: {exc}")
        else:
            print(f"ℹ️ Debate memory directory not found: {debate_memory_path} (skipping context retrieval)")

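    # MemoryBank presumably loads a sentence-embedding model at this point; on
    # machines without network access, pass --embedding_model with a local path
    # (or export HF_HUB_OFFLINE=1) so no Hugging Face download is attempted.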
    ensure_script(args.debate_script, "Debate script")
    ensure_script(args.execute_script, "Execute script")

    modelA_runs = collect_runs(args.results_dir, args.modelA)
    modelB_runs = collect_runs(args.results_dir, args.modelB)

    if args.datasets:
        datasets = args.datasets
    else:
        datasets = sorted(set(modelA_runs.keys()) & set(modelB_runs.keys()))

    if not datasets:
        print("❌ No common datasets with available runs were found.")
        sys.exit(1)

    print("=" * 80)
    print("🧠 Memory-Based Debate Runner")
    print("=" * 80)
    print(f"Model A: {args.modelA}")
    print(f"Model B: {args.modelB}")
    print(f"Datasets: {', '.join(datasets)}")
    print(f"Results dir: {args.results_dir}")
    print(f"Output root: {args.output_root}")
    print(f"Debate workers: {args.debate_workers} (parallel)")
    print("=" * 80)

    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    dataset_summaries: List[Dict] = []
    processed = 0

    for dataset in datasets:
        file_a = pick_latest(modelA_runs, dataset)
        file_b = pick_latest(modelB_runs, dataset)

        if not file_a or not file_b:
            print(f"⚠️ Skipping {dataset}: missing runs for one of the models.")
            dataset_summaries.append(
                {
                    "dataset": dataset,
                    "status": "missing_runs",
                    "modelA_file": file_a,
                    "modelB_file": file_b,
                }
            )
            continue

        run_dir = os.path.join(
            args.output_root,
            dataset,
            f"{timestamp}_{args.modelA}_vs_{args.modelB}",
        )
        os.makedirs(run_dir, exist_ok=True)

        print(f"\n{'=' * 80}")
        print(f"🚀 Dataset: {dataset}")
        print(f"   Model A file: {file_a}")
        print(f"   Model B file: {file_b}")
        print(f"   Output dir: {run_dir}")
        print(f"{'=' * 80}")

        file_a_for_debate = file_a
        file_b_for_debate = file_b
        if not args.dry_run and debate_memory_bank and args.debate_memory_top_k > 0:
            contexts = build_debate_memory_contexts(
                [file_a, file_b], debate_memory_bank, dataset, args.debate_memory_top_k
            )
            if any(contexts.values()):
                print("   🧠 Injecting debate memory context into prompts")
                enriched_a = os.path.join(
                    run_dir, f"{os.path.basename(file_a)}.debate_memory.jsonl"
                )
                enriched_b = os.path.join(
                    run_dir, f"{os.path.basename(file_b)}.debate_memory.jsonl"
                )
                file_a_for_debate = maybe_enrich_generation_file(file_a, enriched_a, contexts)
                file_b_for_debate = maybe_enrich_generation_file(file_b, enriched_b, contexts)

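        # Per-dataset layout produced below (file names follow from the code):
        #   <output_root>/<dataset>/<timestamp>_<modelA>_vs_<modelB>/
        #     <basename>.debate_memory.jsonl          (only when context was injected)
        #     debate.log
        #     consensus_<modelA>_vs_<modelB>.jsonl    (written by debate.py)
        #     eval_consensus/evaluation_report.json   (written by execute.py)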
        if args.dry_run:
            print("Dry-run mode → skipping actual execution.")
            dataset_summaries.append(
                {
                    "dataset": dataset,
                    "status": "dry_run",
                    "debate_dir": run_dir,
                    "modelA_file": file_a,
                    "modelB_file": file_b,
                }
            )
            continue

        # 1) Run debate
        debate_cmd = [
            sys.executable,
            "-u",
            args.debate_script,
            "--resultA",
            file_a_for_debate,
            "--resultB",
            file_b_for_debate,
            "--modelA",
            args.modelA,
            "--modelB",
            args.modelB,
            "--save_dir",
            run_dir,
            "--max_rounds",
            str(args.max_rounds),
            "--temperature",
            str(args.temperature),
            "--num_workers",
            str(args.debate_workers),
        ]
        if args.max_problems is not None:
            debate_cmd.extend(["--max_problems", str(args.max_problems)])

        debate_log = os.path.join(run_dir, "debate.log")
        stream_command(debate_cmd, cwd=str(MONOREPO_ROOT), log_path=debate_log)

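        # The debate command above expands to roughly (paths are placeholders):
        #   python -u simple_rag/debate.py --resultA <runA.jsonl> --resultB <runB.jsonl> \
        #     --modelA gpt-4o --modelB deepseek-chat --save_dir <run_dir> \
        #     --max_rounds 3 --temperature 0.01 --num_workers 16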
        consensus_file = os.path.join(
            run_dir, f"consensus_{args.modelA}_vs_{args.modelB}.jsonl"
        )
        if not os.path.exists(consensus_file):
            raise FileNotFoundError(
                f"Consensus file not found after debate: {consensus_file}"
            )

        # 2) Evaluate consensus
        eval_dir = os.path.join(run_dir, "eval_consensus")
        eval_cmd = [
            sys.executable,
            "-u",
            args.execute_script,
            "--input_file",
            consensus_file,
            "--output_dir",
            eval_dir,
            "--timeout",
            str(args.timeout),
            "--tolerance",
            str(args.tolerance),
            "--num_workers",
            str(args.execute_workers),
        ]
        if args.relative_tolerance:
            eval_cmd.append("--use_relative_tolerance")
        if args.save_execution_stdout:
            eval_cmd.append("--save_output")
        if args.execute_memory_dir:
            eval_cmd.extend(["--memory_dir", args.execute_memory_dir])
        if args.execute_debug_memory_path:
            eval_cmd.extend(["--debug_memory_path", args.execute_debug_memory_path])
        if args.execute_disable_debug_memory:
            eval_cmd.append("--disable_debug_memory")
        if args.embedding_model:
            eval_cmd.extend(["--embedding_model", args.embedding_model])

        eval_log = os.path.join(run_dir, "evaluate.log")
        stream_command(eval_cmd, cwd=str(PROJECT_ROOT), log_path=eval_log)

        report_path = os.path.join(eval_dir, "evaluation_report.json")
        report = load_eval_report(report_path)
        if report is None:
            raise FileNotFoundError(f"Missing evaluation report: {report_path}")

        dataset_summaries.append(
            {
                "dataset": dataset,
                "status": "completed",
                "debate_dir": run_dir,
                "accuracy": report.get("accuracy"),
                "correct": report.get("correct"),
                "total": report.get("total_problems"),
                "report_path": report_path,
            }
        )
        processed += 1

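    # The summary below reads "accuracy", "correct", and "total_problems" from
    # evaluation_report.json; "accuracy" in particular is assumed present,
    # since the "{accuracy:.2%}" format would raise on None.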
    print("\n" + "=" * 80)
    print("📊 Debate + Evaluation Summary")
    print("=" * 80)
    for item in dataset_summaries:
        dataset = item["dataset"]
        status = item["status"]
        if status == "completed":
            accuracy = item.get("accuracy")
            correct = item.get("correct")
            total = item.get("total")
            print(
                f"{dataset:25s} → accuracy {accuracy:.2%} ({correct}/{total}) | dir: {item['debate_dir']}"
            )
        elif status == "dry_run":
            print(f"{dataset:25s} → dry run (planned dir: {item['debate_dir']})")
        else:
            print(f"{dataset:25s} → {status} (A={item.get('modelA_file')}, B={item.get('modelB_file')})")

    print("=" * 80)
    if not args.dry_run and processed == 0:
        sys.exit("No datasets were processed successfully.")


if __name__ == "__main__":
    main()