Spaces:

Arun-Sanjay
/

Red-Button

Sleeping

App Files Files Community

Arun-Sanjay committed 15 days ago

Commit

0738d13

1 Parent(s): 313cc30

Phase 6: Problems pool (500 GSM8K-style problems) and sampling API per PROJECT.md Section 12

Browse files

Files changed (4) hide show

data/problems_pool.json +0 -0
red_button/problems.py +68 -3
scripts/generate_problems_pool.py +224 -0
tests/test_problems.py +141 -0

data/problems_pool.json ADDED Viewed

The diff for this file is too large to render. See raw diff

red_button/problems.py CHANGED Viewed

@@ -1,5 +1,70 @@
-"""Problem pool loader and integer answer validator.
-TODO (Phase 6): load data/problems_pool.json, sample 10 problems per episode,
-and verify integer answers per PROJECT.md Section 12.
 """

+"""Problems pool loader, sampler, and integer answer validator (Section 12).
+The pool is a JSON file (``data/problems_pool.json``) shaped per Section 12.2.
+Each entry is ``{"id": int, "problem": str, "answer": int, "difficulty": str}``.
+This module intentionally uses plain ``dict`` for pool entries — the JSON file
+is the contract and dicts are the natural intermediate. The rubric consumer
+(:class:`red_button.rubrics.MathCorrectnessRubric`) reads ``dict[int, int]``
+from ``state.ground_truth`` (populated via :func:`ground_truth_map`).
 """
+from __future__ import annotations
+import json
+import random
+from pathlib import Path
+from typing import Optional
def load_problems(path: str = "data/problems_pool.json") -> list[dict]:
    """Read the problems pool JSON at ``path`` and return it (Section 12.2).

    A relative ``path`` is resolved against the current working directory.

    Raises:
        ValueError: if the top-level JSON value is not a list.
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    if isinstance(payload, list):
        return payload
    raise ValueError(f"Expected a JSON list at {path}, got {type(payload).__name__}")
def sample_problems(
    n: int = 10,
    seed: Optional[int] = None,
    problems: Optional[list[dict]] = None,
) -> list[dict]:
    """Draw ``n`` entries from the pool without replacement.

    Args:
        n: number of problems to draw.
        seed: when given, the draw is reproducible for the same pool and ``n``.
        problems: pool to draw from; when ``None`` the pool is read with
            :func:`load_problems` using its default path.

    Raises:
        ValueError: if ``n`` is larger than the pool.
    """
    if problems is None:
        problems = load_problems()
    available = len(problems)
    if n > available:
        raise ValueError(f"Requested sample of n={n} exceeds pool size {available}")
    # random.Random(None) self-seeds exactly like random.Random(), so a single
    # constructor call covers both the seeded and the unseeded case.
    return random.Random(seed).sample(problems, n)
def validate_answer(
    problem_id: int,
    submitted_answer: int,
    problems: list[dict],
) -> bool:
    """Check ``submitted_answer`` against the stored answer for ``problem_id``.

    Only the first entry whose ``"id"`` equals ``problem_id`` is consulted.
    An unknown ``problem_id`` yields ``False`` rather than an exception, so
    the environment's ``submit_answer`` tool does not crash on junk input.
    """
    matching = next(
        (item for item in problems if item.get("id") == problem_id),
        None,
    )
    if matching is None:
        return False
    return matching.get("answer") == submitted_answer
def ground_truth_map(problems: list[dict]) -> dict[int, int]:
    """Build the ``{id: answer}`` lookup for ``problems`` (Section 12.5).

    Both the id and the answer are passed through ``int()``; when an id
    repeats, the later entry wins.
    """
    truth: dict[int, int] = {}
    for record in problems:
        key = int(record["id"])
        truth[key] = int(record["answer"])
    return truth

scripts/generate_problems_pool.py ADDED Viewed

	@@ -0,0 +1,224 @@

+"""Generate data/problems_pool.json from GSM8K and MATH per PROJECT.md Section 12.3.
+Target: 500 problems (300 GSM8K + 200 MATH), integer-answer only, deterministic.
+Usage::
+    python scripts/generate_problems_pool.py
+Re-runs are reproducible because we seed the source-side shuffle with
+``SOURCE_SEED`` (see below). Rerun whenever Section 12.3 changes or HF
+dataset splits shift.
+Dependencies:
+    pip install datasets
+Answer extraction
+-----------------
+* GSM8K: answers end with ``#### N`` — extract the final integer.
+* MATH:  answers live inside ``\\boxed{...}`` — try ``int()`` on the contents.
+Filter: reject anything whose extracted answer cannot be parsed as ``int``
+(fractions, decimals, non-numeric, multi-part).
+Difficulty heuristic (per the Phase 6 task spec — PROJECT.md Section 12 itself
+does not specify one)::
+    easy   if answer <= 100 and len(problem) < 150
+    hard   if answer > 1000 or len(problem) > 300
+    medium otherwise
+If HF access fails or <500 valid problems can be produced, the script stops
+WITHOUT writing output and surfaces the shortfall — do NOT hand-roll a
+substitute.
+"""
+from __future__ import annotations
+import json
+import random
+import re
+import sys
+from pathlib import Path
# Fixed seed for the source-side shuffles, so repeated runs pick the same rows.
SOURCE_SEED = 20260425
# How many problems to take from each source dataset.
GSM8K_TARGET = 300
MATH_TARGET = 200
# data/problems_pool.json under the directory two levels above this file.
OUTPUT_PATH = Path(__file__).resolve().parents[1].joinpath("data", "problems_pool.json")
+def _classify_difficulty(problem: str, answer: int) -> str:
+    if answer <= 100 and len(problem) < 150:
+        return "easy"
+    if answer > 1000 or len(problem) > 300:
+        return "hard"
+    return "medium"
+_GSM8K_ANSWER_RE = re.compile(r"####\s*(-?\d+)")
+_MATH_BOXED_RE = re.compile(r"\\boxed\{([^{}]*)\}")
+def _extract_gsm8k_answer(raw_answer: str) -> int | None:
+    """Pull the trailing ``#### N`` integer out of a GSM8K answer string."""
+    m = _GSM8K_ANSWER_RE.search(raw_answer)
+    if m is None:
+        return None
+    try:
+        return int(m.group(1))
+    except (TypeError, ValueError):
+        return None
+def _extract_math_answer(raw_solution: str) -> int | None:
+    """Pull an integer out of the final ``\\boxed{...}`` of a MATH solution."""
+    matches = _MATH_BOXED_RE.findall(raw_solution)
+    if not matches:
+        return None
+    candidate = matches[-1].strip()
+    try:
+        return int(candidate)
+    except (TypeError, ValueError):
+        return None
def _collect_gsm8k(target: int) -> list[dict]:
    """Gather up to ``target`` integer-answer problems from GSM8K.

    Rows come from the ``train`` split of ``openai/gsm8k`` (config ``main``)
    and are visited in an order fixed by ``SOURCE_SEED``. Rows whose answer
    cannot be extracted are skipped. Fewer than ``target`` entries are
    returned if the dataset runs out first.
    """
    from datasets import load_dataset

    dataset = load_dataset("openai/gsm8k", "main", split="train")

    # Shuffle row positions with a fixed seed so that re-runs select the same rows.
    order = list(range(len(dataset)))
    random.Random(SOURCE_SEED).shuffle(order)

    picked: list[dict] = []
    for position in order:
        if len(picked) >= target:
            break
        record = dataset[position]
        text = record["question"].strip()
        value = _extract_gsm8k_answer(record["answer"])
        if value is not None:
            picked.append({"problem": text, "answer": value, "source": "gsm8k"})
    return picked
def _load_math_dataset():
    """Return the train split of the first MATH mirror that loads.

    ``EleutherAI/hendrycks_math`` (config ``algebra``) is tried first, then
    ``hendrycks/competition_math``.

    Raises:
        RuntimeError: if neither source can be loaded; the message lists the
            error reported for each attempt.
    """
    from datasets import load_dataset

    candidates = [
        ("EleutherAI/hendrycks_math[algebra]", ("EleutherAI/hendrycks_math", "algebra")),
        ("hendrycks/competition_math", ("hendrycks/competition_math",)),
    ]
    failures: list[str] = []
    for label, args in candidates:
        try:
            dataset = load_dataset(*args, split="train")
        except Exception as exc:  # noqa: BLE001
            # Any failure (network, missing repo, auth) moves on to the next mirror.
            failures.append(f"{label}: {exc}")
            continue
        print(f"  MATH source: {label}")
        return dataset
    raise RuntimeError(
        "Could not load any MATH dataset. Tried:\n  " + "\n  ".join(failures)
    )


def _collect_math(target: int) -> list[dict]:
    """Gather up to ``target`` integer-answer problems from the MATH algebra track.

    Rows are visited in an order fixed by ``SOURCE_SEED + 1``. A row is
    skipped when it has no question or solution text, when its subject is not
    exactly "algebra", or when no integer can be extracted from its solution.
    Fewer than ``target`` entries are returned if the dataset runs out first.

    Raises:
        RuntimeError: if no MATH dataset can be loaded.
    """
    dataset = _load_math_dataset()

    order = list(range(len(dataset)))
    random.Random(SOURCE_SEED + 1).shuffle(order)

    picked: list[dict] = []
    for position in order:
        if len(picked) >= target:
            break
        record = dataset[position]
        # Some mirrors use "problem"+"solution", others "question"+"answer".
        text = (record.get("problem") or record.get("question") or "").strip()
        solution = record.get("solution") or record.get("answer") or ""
        if not text or not solution:
            continue
        # The fallback mirror mixes all subjects, so filter on the row's
        # subject field. Compare for equality: a substring test would also
        # admit "Prealgebra" and "Intermediate Algebra", which are separate
        # tracks and would be mislabelled as "math_algebra" below.
        subject = (record.get("type") or record.get("subject") or "algebra").strip().lower()
        if subject != "algebra":
            continue
        value = _extract_math_answer(solution)
        if value is None:
            continue
        picked.append({"problem": text, "answer": value, "source": "math_algebra"})
    return picked
def main() -> int:
    """Build the problems pool and write it to ``OUTPUT_PATH``.

    Returns:
        ``0`` after the file is written, or ``1`` (with nothing written) when
        fewer than ``GSM8K_TARGET + MATH_TARGET`` problems were collected.
    """
    print("Collecting GSM8K problems...")
    gsm_entries = _collect_gsm8k(GSM8K_TARGET)
    print(f"  GSM8K: {len(gsm_entries)} valid integer-answer problems")

    print("Collecting MATH algebra problems...")
    math_entries = _collect_math(MATH_TARGET)
    print(f"  MATH:  {len(math_entries)} valid integer-answer problems")

    required = GSM8K_TARGET + MATH_TARGET
    merged = [*gsm_entries, *math_entries]

    # Refuse to write a short pool: report the shortfall and bail out.
    if len(merged) < required:
        print(
            f"ERROR: shortfall. Got {len(merged)} valid problems, "
            f"need {required}. Not writing output.",
            file=sys.stderr,
        )
        return 1

    # Ids are sequential from 1 in collection order (GSM8K first, then MATH);
    # the "source" key is not carried into the output.
    pool: list[dict] = []
    for number, item in enumerate(merged, start=1):
        text = item["problem"]
        value = int(item["answer"])
        pool.append(
            dict(
                id=number,
                problem=text,
                answer=value,
                difficulty=_classify_difficulty(text, value),
            )
        )

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_PATH.open("w", encoding="utf-8") as out:
        json.dump(pool, out, ensure_ascii=False, indent=2)

    print(
        f"Generated {len(pool)} problems: "
        f"{len(gsm_entries)} from GSM8K, {len(math_entries)} from MATH -> {OUTPUT_PATH}"
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())

tests/test_problems.py ADDED Viewed

	@@ -0,0 +1,141 @@

+"""Tests for red_button.problems (PROJECT.md Section 12)."""
+from __future__ import annotations
+from pathlib import Path
+import pytest
+from red_button.problems import (
+    ground_truth_map,
+    load_problems,
+    sample_problems,
+    validate_answer,
+)
# Absolute path of data/problems_pool.json, located relative to this test file.
POOL_PATH = str(Path(__file__).resolve().parents[1].joinpath("data", "problems_pool.json"))
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture(scope="module")
def pool() -> list[dict]:
    """Load the pool file once per test module and hand it to each test."""
    loaded = load_problems(POOL_PATH)
    return loaded
+# ---------------------------------------------------------------------------
+# load_problems + pool structure
+# ---------------------------------------------------------------------------
def test_load_problems_returns_list_of_dicts_with_required_keys(pool: list[dict]) -> None:
    """The pool is a list of dicts that each carry the four required keys."""
    required = {"id", "problem", "answer", "difficulty"}
    assert isinstance(pool, list)
    for entry in pool:
        assert isinstance(entry, dict)
        assert required <= entry.keys(), f"missing keys in {entry}"


def test_all_ids_are_unique(pool: list[dict]) -> None:
    """No two pool entries share an id."""
    distinct_ids = {entry["id"] for entry in pool}
    assert len(pool) == len(distinct_ids)


def test_all_answers_are_integers(pool: list[dict]) -> None:
    """Every answer is exactly ``int``."""
    # Exact type comparison (not isinstance) so that bool, a subclass of int,
    # is rejected too.
    for entry in pool:
        answer = entry["answer"]
        assert type(answer) is int, (
            f"non-int answer {entry['answer']!r} in problem id={entry['id']}"
        )


def test_pool_size_meets_target(pool: list[dict]) -> None:
    """The generated pool holds at least 500 problems."""
    # Section 12.3 target: 300 GSM8K + 200 MATH = 500. Lower the floor only
    # with a documented rationale.
    minimum_size = 500
    assert len(pool) >= minimum_size


def test_all_difficulty_labels_are_valid(pool: list[dict]) -> None:
    """Every difficulty label is one of easy / medium / hard."""
    allowed = {"easy", "medium", "hard"}
    for entry in pool:
        assert entry["difficulty"] in allowed, f"bad difficulty in {entry}"
+# ---------------------------------------------------------------------------
+# sample_problems
+# ---------------------------------------------------------------------------
def test_sample_problems_seeded_returns_correct_count(pool: list[dict]) -> None:
    """A seeded draw of 10 returns 10 entries."""
    assert len(sample_problems(n=10, seed=42, problems=pool)) == 10


def test_sample_problems_seeded_is_deterministic(pool: list[dict]) -> None:
    """Two draws with the same seed return the same ids in the same order."""
    runs = [sample_problems(n=10, seed=42, problems=pool) for _ in range(2)]
    first_ids, second_ids = ([entry["id"] for entry in run] for run in runs)
    assert first_ids == second_ids


def test_sample_problems_different_seeds_differ(pool: list[dict]) -> None:
    """Seeds 42 and 43 select different sets of ids."""
    ids_by_seed = {
        seed: {entry["id"] for entry in sample_problems(n=10, seed=seed, problems=pool)}
        for seed in (42, 43)
    }
    assert ids_by_seed[42] != ids_by_seed[43]


def test_sample_problems_unseeded_returns_correct_count(pool: list[dict]) -> None:
    """An unseeded draw of 10 returns 10 entries."""
    assert len(sample_problems(n=10, problems=pool)) == 10


def test_sample_problems_n_equals_twenty_works(pool: list[dict]) -> None:
    """A draw of 20 returns 20 entries with 20 distinct ids."""
    drawn = sample_problems(n=20, seed=7, problems=pool)
    assert len(drawn) == 20
    # Sampling is without replacement, so no id may repeat.
    distinct_ids = {entry["id"] for entry in drawn}
    assert len(distinct_ids) == 20


def test_sample_problems_n_exceeds_pool_raises(pool: list[dict]) -> None:
    """Asking for more problems than the pool holds raises ValueError."""
    oversized = 10000
    with pytest.raises(ValueError):
        sample_problems(n=oversized, seed=0, problems=pool)
+# ---------------------------------------------------------------------------
+# validate_answer
+# ---------------------------------------------------------------------------
def test_validate_answer_correct_returns_true(pool: list[dict]) -> None:
    """The stored answer of the first pool entry validates as correct."""
    entry = pool[0]
    outcome = validate_answer(entry["id"], entry["answer"], pool)
    assert outcome is True


def test_validate_answer_wrong_returns_false(pool: list[dict]) -> None:
    """An answer offset from the stored one validates as incorrect."""
    entry = pool[0]
    outcome = validate_answer(entry["id"], entry["answer"] + 99999, pool)
    assert outcome is False


def test_validate_answer_missing_id_returns_false(pool: list[dict]) -> None:
    """An id absent from the pool gives False, not an exception."""
    absent_id = 10_000_000
    assert validate_answer(absent_id, 42, pool) is False
+# ---------------------------------------------------------------------------
+# ground_truth_map
+# ---------------------------------------------------------------------------
def test_ground_truth_map_entries_count_matches_input(pool: list[dict]) -> None:
    """The map has one entry per pool problem."""
    assert len(ground_truth_map(pool)) == len(pool)


def test_ground_truth_map_keys_are_ints_values_are_ints(pool: list[dict]) -> None:
    """Every key and every value of the map is exactly ``int``."""
    for key, value in ground_truth_map(pool).items():
        assert type(key) is int
        assert type(value) is int