add files
Browse files- Dockerfile +12 -0
- README.md +163 -0
- environment.py +158 -0
- inference.py +242 -0
- main.py +109 -0
- models.py +119 -0
- openenv.yaml +109 -0
- requirements-server.txt +3 -0
- requirements.txt +2 -0
- rewards.py +396 -0
- tasks.py +448 -0
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Slim Python base keeps the image small; 3.11 matches the server code.
FROM python:3.11-slim

WORKDIR /app

# Install dependencies in their own layer (before copying sources) so the
# pip layer is cached across code-only rebuilds.
COPY requirements-server.txt .
RUN pip install --no-cache-dir -r requirements-server.txt

# Server sources + OpenEnv metadata only; client-side files are not needed.
COPY models.py tasks.py rewards.py environment.py main.py openenv.yaml ./

EXPOSE 8000

# Serve the FastAPI app defined in main.py on all interfaces.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# IndicScriptureQA — OpenEnv Environment
|
| 2 |
+
|
| 3 |
+
**Semantic structure and factual grounding evaluation for low-resource Indic languages.**
|
| 4 |
+
|
| 5 |
+
Most LLM benchmarks for Hindi, Sanskrit, and other Indic languages test surface-level factual recall — did the model get the right answer? This environment goes further. It evaluates whether an agent can produce answers that are not only **factually correct** but also **semantically well-formed**: logically ordered, terminologically precise, structurally complete, and coherently written — the qualities that separate a genuinely useful answer from one that merely contains the right words in the wrong shape.
|
| 6 |
+
|
| 7 |
+
The domain is Indic scriptural knowledge (Vedas, Upanishads, Ramayana, Mahabharata, Bhagavad Gita, Puranas), chosen because it stresses every axis at once: factual precision matters (misattributing a verse to the wrong text is a hallucination), but so does structural literacy — knowing that an explanation of Rta should distinguish its natural-law and moral-law dimensions, that the Samudra Manthan narrative has a specific dramatic arc, or that "nishkama karma" is the correct term, not the English gloss "selfless action."
|
| 8 |
+
|
| 9 |
+
## The problem with low-resource language evaluation
|
| 10 |
+
|
| 11 |
+
LLMs fail on low-resource languages in ways that pure accuracy metrics miss:
|
| 12 |
+
|
| 13 |
+
**Terminology collapse.** Models substitute English glosses for domain-specific terms — writing "cosmic order" instead of "Rta", "meditation" instead of "dhyana", "duty" instead of "svadharma." This strips cultural and semantic precision even when the underlying fact is technically correct.
|
| 14 |
+
|
| 15 |
+
**Structural incoherence.** Answers about complex topics arrive as bags of loosely related facts instead of logically sequenced arguments. An explanation of the six Darshanas that jumbles founders with commentators, or a Dashavatara account that breaks chronological ordering, fails structurally even if every individual claim is true.
|
| 16 |
+
|
| 17 |
+
**Completeness gaps.** Models cover one dimension of a multi-faceted concept and call it done — describing dharma only as "duty" without addressing its subtlety (sukshma), context-dependence, or the rajadharma/apaddharma/moksha-dharma triad that the Mahabharata actually teaches.
|
| 18 |
+
|
| 19 |
+
**Misconception propagation.** Some errors are so common in training data that models reproduce them confidently — Shankaracharya "founding" Vedanta (he was a commentator, not the founder), or Indra "maintaining" Rta (that's Varuna). These need active detection and penalisation, not just factual comparison.
|
| 20 |
+
|
| 21 |
+
This environment provides a structured RL benchmark for training and evaluating agents that address **all four failure modes simultaneously**.
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
## How it works
|
| 26 |
+
|
| 27 |
+
An agent receives a question and a pre-generated answer that may be flawed along any combination of axes — factual errors, poor structure, missing terminology, wrong ordering, incomplete coverage. The agent interacts with the environment through a fixed action space to iteratively improve the answer before finalising it.
|
| 28 |
+
|
| 29 |
+
## Action Space
|
| 30 |
+
|
| 31 |
+
| Action | Payload | Effect |
|
| 32 |
+
|---------------|-------------------------------------------|-------------------------------------------------------|
|
| 33 |
+
| `RETRIEVE` | Optional query string | Surfaces the next available source passage |
|
| 34 |
+
| `EDIT` | New answer text | Rewrites to fix factual errors and improve content |
|
| 35 |
+
| `RESTRUCTURE` | Reorganised answer text | Reorganises flow, ordering, and terminology without changing facts |
|
| 36 |
+
| `CITE` | Citation string (e.g. `"Bhagavad Gita 2.47"`) | Attaches a citation |
|
| 37 |
+
| `ACCEPT` | — | Accepts answer as final (terminal) |
|
| 38 |
+
| `REJECT` | — | Rejects the answer entirely (terminal) |
|
| 39 |
+
|
| 40 |
+
The distinction between `EDIT` and `RESTRUCTURE` is deliberate. EDIT changes what the answer says. RESTRUCTURE changes how it says it — reordering paragraphs, inserting transitions, swapping an English gloss for the correct Sanskrit term, expanding a single sentence into the three conceptual aspects the topic requires. The grader scores them differently: RESTRUCTURE is penalised if it destroys factual content, and EDIT is measured on both factual and structural improvement.
|
| 41 |
+
|
| 42 |
+
## Observation Space
|
| 43 |
+
|
| 44 |
+
| Field | Type | Description |
|
| 45 |
+
|----------------------|--------------|-------------------------------------------------------------|
|
| 46 |
+
| `question` | `str` | The question being answered |
|
| 47 |
+
| `current_answer` | `str` | Current (possibly flawed) answer |
|
| 48 |
+
| `retrieved_passages` | `list[str]` | Source passages retrieved so far |
|
| 49 |
+
| `current_citations` | `list[str]` | Citations attached so far |
|
| 50 |
+
| `steps_remaining` | `int` | Steps left in the episode |
|
| 51 |
+
| `task_name` | `str` | Active task identifier |
|
| 52 |
+
| `feedback` | `str?` | Feedback from the last action (includes structural breakdown) |
|
| 53 |
+
| `structural_hints` | `list[str]` | Non-spoiler hints about expected answer structure |
|
| 54 |
+
|
| 55 |
+
`structural_hints` are the agent's window into what the grader expects structurally — things like "Use the Sanskrit term for selfless action", "Cover scriptural, ritual, AND mathematical dimensions", or "Follow narrative arc: setup → churning → crisis → treasures → resolution." They don't reveal the answer but guide the agent toward well-formed output.
|
| 56 |
+
|
| 57 |
+
## Tasks
|
| 58 |
+
|
| 59 |
+
| Task | Difficulty | Max Steps | Focus |
|
| 60 |
+
|-----------------------|------------|-----------|-------|
|
| 61 |
+
| `verify-factual` | Easy | 5 | Can the agent distinguish a correct answer from a wrong one, accounting for both factual accuracy and structural adequacy? |
|
| 62 |
+
| `correct-and-cite` | Medium | 8 | Given a partially correct answer with missing citations and poor structure, can the agent retrieve sources, fix gaps, add terminology, and reorganise? |
|
| 63 |
+
| `fix-hallucination` | Hard | 12 | Can the agent detect subtle hallucinations woven into plausible text while simultaneously fixing structural problems: wrong concept ordering, banned misconception terms, incomplete aspect coverage? |
|
| 64 |
+
|
| 65 |
+
Each task has 5 scenarios covering the Vedas, Upanishads, Ramayana, Mahabharata, Bhagavad Gita, and Puranas. Every scenario carries both factual ground truth and a `StructuralMeta` specification defining required terms, required sections, expected ordering, and banned misconception markers.
|
| 66 |
+
|
| 67 |
+
## Reward Structure
|
| 68 |
+
|
| 69 |
+
The final score blends **factual quality** and **structural quality**. The raw reward components below can sum to more than 1.0; the final score is normalised into **[0.0, 1.0]**.
|
| 70 |
+
|
| 71 |
+
### Terminal reward (on ACCEPT)
|
| 72 |
+
|
| 73 |
+
| Component | Max | What it measures |
|
| 74 |
+
|---------------------|-------|------------------------------------------------------|
|
| 75 |
+
| Factual similarity | 0.90 | Token-F1 between final answer and ground truth |
|
| 76 |
+
| Citation recall | 0.30 | Fraction of expected citations matched |
|
| 77 |
+
| Structural quality | 0.70 | Composite of 4 axes (see below) |
|
| 78 |
+
| Efficiency bonus | 0.20 | Reward for finishing in fewer steps |
|
| 79 |
+
|
| 80 |
+
### Structural quality composite (0.70 max)
|
| 81 |
+
|
| 82 |
+
| Axis | Weight | What it catches |
|
| 83 |
+
|------------------|--------|----------------------------------------------------------------|
|
| 84 |
+
| **Terminology** | 0.30 | Are the correct Sanskrit/domain terms present? Are banned misconception markers absent? |
|
| 85 |
+
| **Completeness** | 0.25 | Does the answer cover all required conceptual aspects of the topic? |
|
| 86 |
+
| **Ordering** | 0.25 | Do concepts appear in the expected logical/narrative sequence? |
|
| 87 |
+
| **Coherence** | 0.20 | Transition quality, sentence-structure uniformity, multi-sentence flow |
|
| 88 |
+
|
| 89 |
+
All four axes are computed without ML dependencies — token matching, keyword heuristics, positional analysis, and discourse marker detection — so the environment runs on minimal hardware (2 vCPU, 8 GB RAM).
|
| 90 |
+
|
| 91 |
+
### Per-step shaping
|
| 92 |
+
|
| 93 |
+
| Action | Good outcome | Bad outcome |
|
| 94 |
+
|--------------|-----------------------|--------------------------------|
|
| 95 |
+
| `RETRIEVE` | +0.05 (useful) | −0.15 (redundant, >3 times) |
|
| 96 |
+
| `EDIT` | +0.20 + quality delta | −0.20 (degradation) |
|
| 97 |
+
| `RESTRUCTURE`| +0.25 + struct delta | −0.25 (destroyed facts) |
|
| 98 |
+
| `CITE` | +0.15 (correct) | −0.05 (wrong) |
|
| 99 |
+
|
| 100 |
+
Step-level rewards blend factual and structural deltas (60/40 for EDIT, structure-dominant for RESTRUCTURE), giving the agent continuous signal throughout the episode rather than only at termination.
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
## Setup
|
| 105 |
+
|
| 106 |
+
### Server (Docker)
|
| 107 |
+
|
| 108 |
+
```bash
|
| 109 |
+
docker build -t indic-scripture-qa .
|
| 110 |
+
docker run -p 8000:8000 indic-scripture-qa
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
Verify: `curl -X POST http://localhost:8000/reset -H 'Content-Type: application/json' -d '{}'`
|
| 114 |
+
|
| 115 |
+
### Inference
|
| 116 |
+
|
| 117 |
+
```bash
|
| 118 |
+
pip install -r requirements.txt
|
| 119 |
+
|
| 120 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 121 |
+
export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
|
| 122 |
+
export HF_TOKEN="your-token"
|
| 123 |
+
export PING_URL="http://localhost:8000"
|
| 124 |
+
|
| 125 |
+
python inference.py
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
### Validate
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
pip install openenv-core
|
| 132 |
+
openenv validate
|
| 133 |
+
```
|
| 134 |
+
|
| 135 |
+
## Baseline Scores
|
| 136 |
+
|
| 137 |
+
| Task | Score |
|
| 138 |
+
|----------------------|--------|
|
| 139 |
+
| `verify-factual` | ~0.40 |
|
| 140 |
+
| `correct-and-cite` | ~0.30 |
|
| 141 |
+
| `fix-hallucination` | ~0.22 |
|
| 142 |
+
| **Average** | ~0.31 |
|
| 143 |
+
|
| 144 |
+
*(Qwen2.5-72B-Instruct, temperature=0.4, scenario 0, structural eval enabled)*
|
| 145 |
+
|
| 146 |
+
## Project Structure
|
| 147 |
+
|
| 148 |
+
```
|
| 149 |
+
├── openenv.yaml # OpenEnv metadata
|
| 150 |
+
├── Dockerfile # Server container
|
| 151 |
+
├── main.py # FastAPI server (reset/step/state)
|
| 152 |
+
├── environment.py # Core env logic
|
| 153 |
+
├── models.py # Typed Pydantic models + StructuralMeta
|
| 154 |
+
├── tasks.py # Task definitions, scenarios, structural metadata
|
| 155 |
+
├── rewards.py # Factual + structural reward computation
|
| 156 |
+
├── inference.py # Baseline inference script
|
| 157 |
+
├── requirements.txt # Client deps
|
| 158 |
+
└── requirements-server.txt # Server deps
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
## License
|
| 162 |
+
|
| 163 |
+
MIT
|
environment.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Core environment logic for IndicScriptureQA.
|
| 3 |
+
|
| 4 |
+
Implements the OpenEnv interface:
|
| 5 |
+
reset(task_name, scenario_index) → StepResult
|
| 6 |
+
step(action) → StepResult
|
| 7 |
+
state() → EnvState
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import random
|
| 13 |
+
from typing import Optional
|
| 14 |
+
|
| 15 |
+
from models import Action, ActionType, EnvState, Observation, StepResult, StructuralMeta
|
| 16 |
+
from rewards import normalize_score, step_reward, terminal_reward
|
| 17 |
+
from tasks import TASKS, Scenario, TaskConfig
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class IndicScriptureQAEnv:
    """Stateful environment — one instance per episode.

    Implements the OpenEnv triad: ``reset`` starts an episode from a task
    scenario, ``step`` applies one agent action, ``state`` returns a deep
    copy of the full (including hidden ground-truth) episode state.
    """

    def __init__(self) -> None:
        # No episode until reset() is called.
        self._state: Optional[EnvState] = None

    # ── reset ─────────────────────────────────────────────────────────────

    def reset(
        self,
        task_name: str = "verify-factual",
        scenario_index: Optional[int] = None,
    ) -> StepResult:
        """Start a new episode.

        Args:
            task_name: Key into the TASKS registry.
            scenario_index: Optional deterministic scenario pick; wrapped
                with modulo so any int is valid. ``None`` picks at random.

        Returns:
            StepResult with the initial observation, zero reward, not done.

        Raises:
            ValueError: if ``task_name`` is not a registered task.
        """
        if task_name not in TASKS:
            raise ValueError(f"Unknown task {task_name!r}. Choose from {list(TASKS)}")

        cfg: TaskConfig = TASKS[task_name]
        if scenario_index is not None:
            # Modulo keeps any caller-supplied index in range.
            idx = scenario_index % len(cfg.scenarios)
        else:
            idx = random.randint(0, len(cfg.scenarios) - 1)

        sc: Scenario = cfg.scenarios[idx]

        # Copy list fields so episode mutations never touch scenario data.
        self._state = EnvState(
            question=sc.question,
            current_answer=sc.given_answer,
            original_answer=sc.given_answer,
            ground_truth_answer=sc.ground_truth_answer,
            ground_truth_citations=list(sc.ground_truth_citations),
            available_passages=list(sc.available_passages),
            answer_is_correct=sc.answer_is_correct,
            factual_is_correct=sc.factual_is_correct,
            structural_meta=sc.structural_meta,
            structural_hints=list(sc.structural_hints),
            task_name=task_name,
            max_steps=cfg.max_steps,
            steps_remaining=cfg.max_steps,
            step_count=0,
            done=False,
            cumulative_reward=0.0,
            rewards=[],
            retrieval_count=0,
            edit_count=0,
            restructure_count=0,
            feedback="Episode started. Examine the answer for factual accuracy AND semantic structure.",
        )
        return StepResult(observation=self._state.to_observation(), reward=0.0, done=False)

    # ── step ──────────────────────────────────────────────────────────────

    def step(self, action: Action) -> StepResult:
        """Apply one agent action and return the resulting StepResult.

        Raises:
            RuntimeError: if called before reset() or after the episode ended.
        """
        s = self._state
        if s is None:
            raise RuntimeError("Call reset() before step().")
        if s.done:
            raise RuntimeError("Episode already finished. Call reset().")

        # Every action — valid or not — consumes a step.
        s.step_count += 1
        s.steps_remaining -= 1
        act = action.action_type
        payload = (action.payload or "").strip()

        reward = 0.0
        feedback = ""
        done = False

        # ── action dispatch ───────────────────────────────────────────────
        if act == ActionType.RETRIEVE:
            s.retrieval_count += 1
            if s.available_passages:
                # Cycle through passages in order; duplicates are not re-added.
                idx = (s.retrieval_count - 1) % len(s.available_passages)
                passage = s.available_passages[idx]
                if passage not in s.retrieved_passages:
                    s.retrieved_passages.append(passage)
            # NOTE(review): reward is computed after the passage bookkeeping;
            # assumed to apply even when no passages exist — confirm in rewards.py.
            reward, feedback = step_reward(s, act, payload)

        elif act == ActionType.EDIT:
            s.edit_count += 1
            # Reward is computed BEFORE current_answer is replaced, so
            # step_reward can score the delta of payload vs. current answer.
            reward, feedback = step_reward(s, act, payload)
            if payload:
                s.current_answer = payload

        elif act == ActionType.RESTRUCTURE:
            s.restructure_count += 1
            # Same ordering as EDIT: score the proposed text, then apply it.
            reward, feedback = step_reward(s, act, payload)
            if payload:
                s.current_answer = payload

        elif act == ActionType.CITE:
            # Citations are deduplicated; the reward sees the updated list.
            if payload and payload not in s.current_citations:
                s.current_citations.append(payload)
            reward, feedback = step_reward(s, act, payload)

        elif act == ActionType.ACCEPT:
            # Terminal: full episode grading happens in terminal_reward.
            t_reward, feedback = terminal_reward(s, act)
            reward = t_reward
            done = True

        elif act == ActionType.REJECT:
            t_reward, feedback = terminal_reward(s, act)
            reward = t_reward
            done = True

        else:
            # Unknown action: small penalty, episode continues.
            reward = -0.10
            feedback = f"Unknown action type: {act}"

        # ── check step limit ──────────────────────────────────────────────
        if not done and s.steps_remaining <= 0:
            # Out of steps: grade as an implicit ACCEPT with a -0.20 penalty
            # for failing to terminate voluntarily.
            t_reward, t_fb = terminal_reward(s, ActionType.ACCEPT)
            reward += t_reward - 0.20
            feedback += f" | Forced termination (step limit). {t_fb}"
            done = True

        # ── bookkeeping ──────────────────────────────────────────────────
        s.rewards.append(reward)
        s.cumulative_reward += reward
        s.done = done
        s.feedback = feedback

        # info carries the normalized final score only on the terminal step.
        info = {}
        if done:
            info["score"] = normalize_score(s.cumulative_reward)
            info["cumulative_reward"] = s.cumulative_reward

        return StepResult(
            observation=s.to_observation(),
            reward=reward,
            done=done,
            info=info,
        )

    # ── state ─────────────────────────────────────────────────────────────

    def state(self) -> EnvState:
        """Return a deep copy of the episode state (safe for callers to mutate).

        Raises:
            RuntimeError: if no episode has been started.
        """
        if self._state is None:
            raise RuntimeError("Call reset() first.")
        return self._state.model_copy(deep=True)
|
inference.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline inference script for IndicScriptureQA.
|
| 3 |
+
|
| 4 |
+
Runs an LLM agent against all 3 tasks via the OpenEnv HTTP API.
|
| 5 |
+
Emits structured [START]/[STEP]/[END] logs per the OpenEnv spec.
|
| 6 |
+
|
| 7 |
+
The agent evaluates BOTH factual accuracy AND semantic structure:
|
| 8 |
+
- factual: hallucination detection, correction
|
| 9 |
+
- structural: coherence, completeness, terminology, logical ordering
|
| 10 |
+
|
| 11 |
+
Environment variables:
|
| 12 |
+
API_BASE_URL LLM endpoint (default: HF router)
|
| 13 |
+
MODEL_NAME Model identifier (default: Qwen2.5-72B-Instruct)
|
| 14 |
+
HF_TOKEN API key
|
| 15 |
+
PING_URL Environment server (default: http://localhost:8000)
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import json
|
| 19 |
+
import os
|
| 20 |
+
import textwrap
|
| 21 |
+
from typing import Dict, List, Optional
|
| 22 |
+
|
| 23 |
+
import requests
|
| 24 |
+
from openai import OpenAI
|
| 25 |
+
|
| 26 |
+
# ── Config ────────────────────────────────────────────────────────────────────

# LLM credentials: HF_TOKEN preferred, generic API_KEY as fallback.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
# OpenAI-compatible chat-completions endpoint (defaults to the HF router).
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
# Base URL of the running environment server (see Dockerfile / main.py).
ENV_URL = os.getenv("PING_URL", "http://localhost:8000")
BENCHMARK = "indic_scripture_qa"
TEMPERATURE = 0.4
MAX_TOKENS = 600

# One entry per environment task; max_steps mirrors the server-side limits
# documented in the README task table.
TASKS = [
    {"name": "verify-factual", "max_steps": 5},
    {"name": "correct-and-cite", "max_steps": 8},
    {"name": "fix-hallucination", "max_steps": 12},
]
|
| 41 |
+
|
| 42 |
+
SYSTEM_PROMPT = textwrap.dedent("""\
|
| 43 |
+
You are an expert agent that both CORRECTS hallucinations and IMPROVES the
|
| 44 |
+
semantic structure of answers about Indic scriptures (Vedas, Upanishads,
|
| 45 |
+
Ramayana, Mahabharata, Bhagavad Gita, Puranas).
|
| 46 |
+
|
| 47 |
+
Each turn you receive an observation with:
|
| 48 |
+
- question, current_answer, retrieved_passages, current_citations,
|
| 49 |
+
steps_remaining, feedback, structural_hints
|
| 50 |
+
|
| 51 |
+
You must reply with EXACTLY ONE JSON object (no markdown, no explanation):
|
| 52 |
+
{
|
| 53 |
+
"action_type": "RETRIEVE" | "EDIT" | "RESTRUCTURE" | "CITE" | "ACCEPT" | "REJECT",
|
| 54 |
+
"payload": "<string or null>"
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
Actions:
|
| 58 |
+
RETRIEVE — fetch source passages to verify facts
|
| 59 |
+
EDIT — rewrite the answer to fix factual errors AND improve content
|
| 60 |
+
RESTRUCTURE — reorganise the answer's flow, ordering, and coherence WITHOUT
|
| 61 |
+
changing facts (use when facts are right but structure is poor)
|
| 62 |
+
CITE — add a scripture citation (e.g. "Bhagavad Gita 2.47")
|
| 63 |
+
ACCEPT — finalise when answer is both accurate and well-structured
|
| 64 |
+
REJECT — only if the answer is fundamentally unsalvageable
|
| 65 |
+
|
| 66 |
+
Strategy:
|
| 67 |
+
1. RETRIEVE first (1–2 times) to get authoritative source passages.
|
| 68 |
+
2. Check facts against retrieved passages. EDIT to fix any errors.
|
| 69 |
+
3. Read structural_hints. If the answer's flow, terminology, or completeness
|
| 70 |
+
is poor, use RESTRUCTURE to reorganise it.
|
| 71 |
+
4. CITE relevant scripture references.
|
| 72 |
+
5. ACCEPT when the answer is factually accurate, well-structured, uses
|
| 73 |
+
correct Sanskrit terminology, and covers all required aspects.
|
| 74 |
+
6. Be efficient — fewer steps score higher.
|
| 75 |
+
|
| 76 |
+
Evaluation axes (the grader checks ALL of these):
|
| 77 |
+
- Factual similarity to ground truth
|
| 78 |
+
- Citation accuracy
|
| 79 |
+
- Terminology precision (correct Sanskrit/domain terms, no misconception markers)
|
| 80 |
+
- Completeness (all required conceptual aspects covered)
|
| 81 |
+
- Logical ordering (concepts in proper sequence)
|
| 82 |
+
- Coherence (smooth transitions, balanced sentence structure)
|
| 83 |
+
""")
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ── Logging helpers ───────────────────────────────────────────────────────────
|
| 87 |
+
|
| 88 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Emit the structured [START] marker line for a new episode."""
    line = f"[START] task={task} env={env} model={model}"
    print(line, flush=True)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit one structured [STEP] marker line per the OpenEnv log spec."""
    fields = (
        f"step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error or 'null'}",
    )
    print("[STEP] " + " ".join(fields), flush=True)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the final structured [END] marker line with all per-step rewards."""
    joined = ",".join(f"{r:.2f}" for r in rewards)
    line = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.2f} rewards={joined}"
    )
    print(line, flush=True)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# ── Env HTTP helpers ──────────────────────────────────────────────────────────
|
| 110 |
+
|
| 111 |
+
def env_reset(task_name: str, scenario_index: int = 0) -> Dict:
    """POST /reset on the environment server and return the parsed JSON body.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    body = {"task_name": task_name, "scenario_index": scenario_index}
    response = requests.post(f"{ENV_URL}/reset", json=body, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def env_step(action_type: str, payload: Optional[str] = None) -> Dict:
    """POST /step on the environment server and return the parsed JSON body.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    body = {"action_type": action_type, "payload": payload}
    response = requests.post(f"{ENV_URL}/step", json=body, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# ── Agent ─────────────────────────────────────────────────────────────────────
|
| 132 |
+
|
| 133 |
+
def build_user_prompt(obs: Dict, step: int) -> str:
    """Serialise the observation (plus the 1-based step counter) as the user message.

    Missing optional fields are tolerated: ``feedback`` defaults to None and
    ``structural_hints`` to an empty list.
    """
    message = {
        "step": step,
        "question": obs["question"],
        "current_answer": obs["current_answer"],
        "retrieved_passages": obs["retrieved_passages"],
        "current_citations": obs["current_citations"],
        "steps_remaining": obs["steps_remaining"],
        "feedback": obs.get("feedback"),
        "structural_hints": obs.get("structural_hints", []),
    }
    return json.dumps(message, indent=2)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def get_agent_action(client: OpenAI, obs: Dict, step: int) -> Dict:
    """Ask the LLM for the next action.

    Returns the parsed action dict; on any API or parse failure, falls back
    to RETRIEVE during the first two steps and ACCEPT afterwards.
    """
    prompt = build_user_prompt(obs, step)
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        text = (completion.choices[0].message.content or "").strip()
        # Tolerate a markdown fence around the JSON the model returns.
        if text.startswith("```"):
            text = text.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
        return json.loads(text)
    except Exception as exc:
        print(f"[DEBUG] LLM parse error: {exc}", flush=True)
        fallback = (
            {"action_type": "RETRIEVE", "payload": None}
            if step <= 2
            else {"action_type": "ACCEPT", "payload": None}
        )
        return fallback
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# ── Main loop ─────────────────────────────────────────────────────────────────
|
| 172 |
+
|
| 173 |
+
def run_task(client: OpenAI, task_name: str, max_steps: int, scenario_index: int = 0) -> float:
    """Run one episode. Returns score in [0, 1].

    Drives the reset → (act → step) → terminal loop against the environment
    server, logging [START]/[STEP]/[END] markers throughout. Any exception
    during the episode is caught and logged; the score then stays 0.0.
    """
    log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    try:
        result = env_reset(task_name, scenario_index)
        obs = result["observation"]

        for step in range(1, max_steps + 1):
            # Environment may already be done (e.g. forced termination).
            if result.get("done", False):
                break

            agent_action = get_agent_action(client, obs, step)
            # Malformed agent output degrades safely to ACCEPT.
            action_type = agent_action.get("action_type", "ACCEPT")
            payload = agent_action.get("payload")

            result = env_step(action_type, payload)
            obs = result["observation"]
            reward = result.get("reward", 0.0)
            done = result.get("done", False)

            rewards.append(reward)
            steps_taken = step

            action_str = f"{action_type}({payload!r})" if payload else action_type
            log_step(step=step, action=action_str, reward=reward, done=done, error=None)

            if done:
                # Normalized score is only present in info on the terminal step.
                score = result.get("info", {}).get("score", 0.0)
                break

        # Success threshold for the [END] marker; score itself is returned as-is.
        success = score >= 0.10

    except Exception as exc:
        print(f"[DEBUG] Episode error: {exc}", flush=True)

    finally:
        # Always emit the [END] marker, even on error.
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return score
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def main() -> None:
    """Run the baseline agent over every task and print a summary table."""
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    all_scores: Dict[str, float] = {}
    for task in TASKS:
        all_scores[task["name"]] = run_task(
            client, task["name"], task["max_steps"], scenario_index=0
        )
        print(flush=True)

    separator = "=" * 60
    print(separator, flush=True)
    print("BASELINE RESULTS", flush=True)
    for name, sc in all_scores.items():
        print(f" {name:25s} score={sc:.3f}", flush=True)
    avg = sum(all_scores.values()) / len(all_scores) if all_scores else 0.0
    print(f" {'AVERAGE':25s} score={avg:.3f}", flush=True)
    print(separator, flush=True)
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
if __name__ == "__main__":
|
| 242 |
+
main()
|
main.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI server for IndicScriptureQA — OpenEnv compatible.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
POST /reset — start a new episode
|
| 6 |
+
POST /step — take an action
|
| 7 |
+
GET /state — get current environment state
|
| 8 |
+
GET /health — liveness check
|
| 9 |
+
GET /tasks — list available tasks
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
from typing import Optional
|
| 15 |
+
|
| 16 |
+
from fastapi import FastAPI, HTTPException
|
| 17 |
+
from pydantic import BaseModel
|
| 18 |
+
|
| 19 |
+
from environment import IndicScriptureQAEnv
|
| 20 |
+
from models import Action, ActionType
|
| 21 |
+
from tasks import TASKS
|
| 22 |
+
|
| 23 |
+
# FastAPI application object; served by uvicorn (see Dockerfile CMD).
app = FastAPI(
    title="IndicScriptureQA",
    description=(
        "OpenEnv environment for evaluating agents on Indic scripture "
        "hallucination correction AND semantic structure quality."
    ),
    version="1.1.0",
)

# Single module-level environment instance shared by all endpoints — the
# server hosts one episode at a time.
# NOTE(review): concurrent clients would interleave actions on this shared
# state; confirm single-client use is intended.
_env = IndicScriptureQAEnv()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ── Request / Response schemas ────────────────────────────────────────────────
|
| 36 |
+
|
| 37 |
+
class ResetRequest(BaseModel):
    """Request body for POST /reset."""

    # Task to start; invalid names surface as HTTP 400 (ValueError from env.reset).
    task_name: str = "verify-factual"
    # Specific scenario to load; None lets the environment choose.
    scenario_index: Optional[int] = None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class StepRequest(BaseModel):
    """Request body for POST /step."""

    # Case-insensitive; the /step handler coerces it via ActionType(value.upper()).
    action_type: str  # RETRIEVE | EDIT | RESTRUCTURE | CITE | ACCEPT | REJECT
    # Action-specific content (query / replacement text / citation);
    # unused for ACCEPT and REJECT.
    payload: Optional[str] = None
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class TaskInfo(BaseModel):
    """Summary of one task configuration, returned by GET /tasks."""

    name: str
    description: str
    max_steps: int
    num_scenarios: int
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ── Endpoints ─────────────────────────────────────────────────────────────────
|
| 55 |
+
|
| 56 |
+
@app.post("/reset")
def reset(body: ResetRequest = ResetRequest()):
    """Start a new episode; 400 on an unknown task name or scenario index.

    NOTE(review): the default ``ResetRequest()`` instance is shared across
    no-body requests; harmless here since it is never mutated — confirm.
    """
    try:
        episode = _env.reset(
            task_name=body.task_name,
            scenario_index=body.scenario_index,
        )
        return episode.model_dump()
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@app.post("/step")
def step(body: StepRequest):
    """Apply one action to the running episode.

    Returns the StepResult envelope; 400 for an unknown action type or when
    the environment refuses the step (RuntimeError).
    """
    try:
        action_type = ActionType(body.action_type.upper())
    except ValueError:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid action_type {body.action_type!r}. Must be one of: {[a.value for a in ActionType]}",
        )
    try:
        outcome = _env.step(Action(action_type=action_type, payload=body.payload))
        return outcome.model_dump()
    except RuntimeError as e:
        raise HTTPException(status_code=400, detail=str(e))
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
@app.get("/state")
def state():
    """Return the full internal environment state; 400 when no episode is live."""
    try:
        return _env.state().model_dump()
    except RuntimeError as e:
        raise HTTPException(status_code=400, detail=str(e))
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@app.get("/health")
def health():
    """Liveness probe (see openenv.yaml health_endpoint)."""
    return {"status": "ok"}
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
@app.get("/tasks")
def list_tasks():
    """Enumerate every available task with its step budget and scenario count."""
    summaries = []
    for cfg in TASKS.values():
        info = TaskInfo(
            name=cfg.name,
            description=cfg.description,
            max_steps=cfg.max_steps,
            num_scenarios=len(cfg.scenarios),
        )
        summaries.append(info.model_dump())
    return summaries
|
models.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed Pydantic models for the IndicScriptureQA environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from enum import Enum
|
| 6 |
+
from typing import Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# ── Action Space ──────────────────────────────────────────────────────────────
|
| 12 |
+
|
| 13 |
+
class ActionType(str, Enum):
    """Agent action vocabulary; ACCEPT and REJECT terminate the episode."""

    RETRIEVE = "RETRIEVE"        # fetch source passages (payload = optional query)
    EDIT = "EDIT"                # replace the current answer (payload = new answer text)
    RESTRUCTURE = "RESTRUCTURE"  # reorganise answer flow (payload = restructured text)
    CITE = "CITE"                # attach a citation (payload = e.g. "Bhagavad Gita 2.47")
    ACCEPT = "ACCEPT"            # accept answer as final (terminal)
    REJECT = "REJECT"            # reject answer entirely (terminal)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class Action(BaseModel):
    """One agent action: an ActionType plus its optional string payload."""

    action_type: ActionType
    # Content for the action (query / answer text / citation); None for
    # the terminal ACCEPT / REJECT actions.
    payload: Optional[str] = None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ── Structural metadata (hidden from agent, used by grader) ──────────────────
|
| 28 |
+
|
| 29 |
+
class StructuralMeta(BaseModel):
    """Describes the *expected* semantic structure of a correct answer.

    Hidden from the agent; consumed by the structural scorers in rewards.py
    (terminology, completeness, ordering).
    """

    required_terms: List[str] = Field(
        default_factory=list,
        description="Sanskrit / domain terms the answer must contain.",
    )
    required_sections: List[str] = Field(
        default_factory=list,
        description="Conceptual aspects the answer should cover (order-independent).",
    )
    expected_order: List[str] = Field(
        default_factory=list,
        description="Concepts that should appear in this logical sequence.",
    )
    banned_terms: List[str] = Field(
        default_factory=list,
        description="Terms that indicate a common misconception if present.",
    )
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ── Observation Space ─────────────────────────────────────────────────────────
|
| 50 |
+
|
| 51 |
+
class Observation(BaseModel):
    """The agent-visible view of the environment at each step."""

    question: str
    # The answer currently under revision.
    current_answer: str
    retrieved_passages: List[str] = Field(default_factory=list)
    current_citations: List[str] = Field(default_factory=list)
    steps_remaining: int
    task_name: str
    # Human-readable message produced by the last action's reward logic.
    feedback: Optional[str] = None
    # structural hints exposed to the agent (non-spoiler)
    structural_hints: List[str] = Field(
        default_factory=list,
        description="High-level hints about expected answer structure.",
    )
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ── Step Result ───────────────────────────────────────────────────────────────
|
| 67 |
+
|
| 68 |
+
class StepResult(BaseModel):
    """Envelope returned from reset/step: observation plus transition data."""

    observation: Observation
    # Reward earned by this transition.
    reward: float = 0.0
    # True once the episode has terminated.
    done: bool = False
    # Extra diagnostics (e.g. final score) not part of the observation.
    info: dict = Field(default_factory=dict)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# ── Internal State (superset of observation + grading internals) ──────────────
|
| 76 |
+
|
| 77 |
+
class EnvState(BaseModel):
    """Full internal episode state.

    A superset of Observation: the observable fields plus hidden grading
    targets and per-episode bookkeeping. Only the projection produced by
    to_observation() is shown to the agent.
    """

    # observable
    question: str
    current_answer: str
    retrieved_passages: List[str] = Field(default_factory=list)
    current_citations: List[str] = Field(default_factory=list)
    steps_remaining: int = 0
    task_name: str = ""
    feedback: Optional[str] = None
    structural_hints: List[str] = Field(default_factory=list)

    # hidden / grading — factual
    original_answer: str = ""
    ground_truth_answer: str = ""
    ground_truth_citations: List[str] = Field(default_factory=list)
    available_passages: List[str] = Field(default_factory=list)
    answer_is_correct: bool = False  # overall: facts AND structure both OK
    factual_is_correct: bool = False  # facts alone are OK (structure may be bad)

    # hidden / grading — structural
    structural_meta: StructuralMeta = Field(default_factory=StructuralMeta)

    # episode bookkeeping
    step_count: int = 0
    max_steps: int = 8
    done: bool = False
    cumulative_reward: float = 0.0
    rewards: List[float] = Field(default_factory=list)
    retrieval_count: int = 0
    edit_count: int = 0
    restructure_count: int = 0

    def to_observation(self) -> Observation:
        """Project the agent-visible subset of this state.

        Lists are shallow-copied so callers cannot mutate the env state
        through the returned observation.
        """
        return Observation(
            question=self.question,
            current_answer=self.current_answer,
            retrieved_passages=list(self.retrieved_passages),
            current_citations=list(self.current_citations),
            steps_remaining=self.steps_remaining,
            task_name=self.task_name,
            feedback=self.feedback,
            structural_hints=list(self.structural_hints),
        )
|
openenv.yaml
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: indic_scripture_qa
|
| 2 |
+
version: "1.1.0"
|
| 3 |
+
description: >
|
| 4 |
+
An RL environment for evaluating language agents on Indic scripture
|
| 5 |
+
hallucination detection, correction, AND semantic structure quality.
|
| 6 |
+
Agents receive a question and a potentially flawed answer — which may
|
| 7 |
+
contain factual hallucinations, structural incoherence, missing
|
| 8 |
+
terminology, or poor logical ordering — and must retrieve source passages,
|
| 9 |
+
edit, restructure, cite, and decide whether to accept or reject.
|
| 10 |
+
|
| 11 |
+
author: kishlay-notabot
|
| 12 |
+
license: MIT
|
| 13 |
+
|
| 14 |
+
env:
|
| 15 |
+
module: main:app
|
| 16 |
+
port: 8000
|
| 17 |
+
health_endpoint: /health
|
| 18 |
+
|
| 19 |
+
action_space:
|
| 20 |
+
type: object
|
| 21 |
+
fields:
|
| 22 |
+
action_type:
|
| 23 |
+
type: string
|
| 24 |
+
enum: [RETRIEVE, EDIT, RESTRUCTURE, CITE, ACCEPT, REJECT]
|
| 25 |
+
description: >
|
| 26 |
+
RETRIEVE fetches source passages. EDIT fixes factual content.
|
| 27 |
+
RESTRUCTURE reorganises flow/coherence without changing facts.
|
| 28 |
+
CITE adds a scripture reference. ACCEPT/REJECT are terminal.
|
| 29 |
+
payload:
|
| 30 |
+
type: string
|
| 31 |
+
nullable: true
|
| 32 |
+
description: >
|
| 33 |
+
Content for the action — query for RETRIEVE, new answer text for
|
| 34 |
+
EDIT or RESTRUCTURE, citation string for CITE, unused for ACCEPT/REJECT.
|
| 35 |
+
|
| 36 |
+
observation_space:
|
| 37 |
+
type: object
|
| 38 |
+
fields:
|
| 39 |
+
question:
|
| 40 |
+
type: string
|
| 41 |
+
current_answer:
|
| 42 |
+
type: string
|
| 43 |
+
retrieved_passages:
|
| 44 |
+
type: array
|
| 45 |
+
items: { type: string }
|
| 46 |
+
current_citations:
|
| 47 |
+
type: array
|
| 48 |
+
items: { type: string }
|
| 49 |
+
steps_remaining:
|
| 50 |
+
type: integer
|
| 51 |
+
task_name:
|
| 52 |
+
type: string
|
| 53 |
+
feedback:
|
| 54 |
+
type: string
|
| 55 |
+
nullable: true
|
| 56 |
+
structural_hints:
|
| 57 |
+
type: array
|
| 58 |
+
items: { type: string }
|
| 59 |
+
description: >
|
| 60 |
+
Non-spoiler hints about expected answer structure (e.g. required
|
| 61 |
+
terminology, conceptual ordering, completeness expectations).
|
| 62 |
+
|
| 63 |
+
tasks:
|
| 64 |
+
- name: verify-factual
|
| 65 |
+
description: >
|
| 66 |
+
Easy — Verify whether a given answer is factually correct and
|
| 67 |
+
structurally sound. Accept good answers, reject or fix bad ones.
|
| 68 |
+
max_steps: 5
|
| 69 |
+
num_scenarios: 5
|
| 70 |
+
|
| 71 |
+
- name: correct-and-cite
|
| 72 |
+
description: >
|
| 73 |
+
Medium — Improve a partially correct answer by fixing factual gaps,
|
| 74 |
+
restructuring for coherence, adding proper terminology, and citing
|
| 75 |
+
scripture sources.
|
| 76 |
+
max_steps: 8
|
| 77 |
+
num_scenarios: 5
|
| 78 |
+
|
| 79 |
+
- name: fix-hallucination
|
| 80 |
+
description: >
|
| 81 |
+
Hard — Detect subtle hallucinations, fix factual errors, correct
|
| 82 |
+
misused Sanskrit terms, reorder logical flow, and ensure complete
|
| 83 |
+
coverage of required conceptual aspects.
|
| 84 |
+
max_steps: 12
|
| 85 |
+
num_scenarios: 5
|
| 86 |
+
|
| 87 |
+
reward:
|
| 88 |
+
range: [0.0, 1.0]
|
| 89 |
+
components:
|
| 90 |
+
factual:
|
| 91 |
+
weight: 0.43
|
| 92 |
+
description: >
|
| 93 |
+
Token-F1 similarity to ground truth answer (×0.90) plus citation
|
| 94 |
+
recall (×0.30).
|
| 95 |
+
structural:
|
| 96 |
+
weight: 0.33
|
| 97 |
+
description: >
|
| 98 |
+
Composite of four axes: terminology precision (0.30), completeness
|
| 99 |
+
(0.25), logical ordering (0.25), and coherence (0.20). Includes
|
| 100 |
+
banned-term penalties for common misconceptions.
|
| 101 |
+
efficiency:
|
| 102 |
+
weight: 0.10
|
| 103 |
+
description: Bonus for using fewer steps.
|
| 104 |
+
step_shaping:
|
| 105 |
+
weight: 0.14
|
| 106 |
+
description: >
|
| 107 |
+
Per-step signals: +0.05 for useful retrieval, +0.20–0.50 for
|
| 108 |
+
quality edits/restructures, +0.15 for correct citations, with
|
| 109 |
+
penalties for redundancy and degradation.
|
requirements-server.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.110.0
|
| 2 |
+
uvicorn[standard]>=0.27.0
|
| 3 |
+
pydantic>=2.0.0
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openai>=1.0.0
|
| 2 |
+
requests>=2.31.0
|
rewards.py
ADDED
|
@@ -0,0 +1,396 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reward computation for IndicScriptureQA.
|
| 3 |
+
|
| 4 |
+
Two evaluation axes, weighted into a single scalar:
|
| 5 |
+
A. Factual quality — token-F1 similarity to ground truth, citation recall
|
| 6 |
+
B. Structural quality — coherence, completeness, terminology, ordering
|
| 7 |
+
|
| 8 |
+
All scoring is zero-dependency (no ML models) so the env runs on 2 vCPU / 8 GB.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import re
|
| 14 |
+
from typing import List, Tuple
|
| 15 |
+
|
| 16 |
+
from models import ActionType, EnvState, StructuralMeta
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 20 |
+
# A. FACTUAL SCORING
|
| 21 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 22 |
+
|
| 23 |
+
def _tokenize(text: str) -> List[str]:
|
| 24 |
+
"""Lowercase split on non-alphanumeric (keeps Devanagari chars)."""
|
| 25 |
+
return [t for t in re.split(r"[^a-zA-Z0-9\u0900-\u097F]+", text.lower()) if t]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def token_f1(candidate: str, reference: str) -> float:
|
| 29 |
+
"""Token-level F1 between candidate and reference. Returns 0–1."""
|
| 30 |
+
cand_toks = _tokenize(candidate)
|
| 31 |
+
ref_toks = _tokenize(reference)
|
| 32 |
+
if not cand_toks or not ref_toks:
|
| 33 |
+
return 0.0
|
| 34 |
+
cand_set = set(cand_toks)
|
| 35 |
+
ref_set = set(ref_toks)
|
| 36 |
+
common = cand_set & ref_set
|
| 37 |
+
if not common:
|
| 38 |
+
return 0.0
|
| 39 |
+
precision = len(common) / len(cand_set)
|
| 40 |
+
recall = len(common) / len(ref_set)
|
| 41 |
+
return 2 * precision * recall / (precision + recall)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _normalize_citation(c: str) -> str:
    """Canonicalise a citation: trim, lowercase, collapse internal whitespace."""
    return re.sub(r"\s+", " ", c.strip().lower())


def citation_recall(predicted: List[str], ground_truth: List[str]) -> float:
    """Fraction of ground-truth citations matched by *predicted*.

    A ground-truth citation counts as matched when, after normalisation, it
    contains — or is contained in — some predicted citation (fuzzy substring
    match). Returns 1.0 when there are no ground-truth citations.

    Fix: blank / whitespace-only predictions are discarded. They normalise
    to "", and "" is a substring of every string, so previously a single
    blank prediction matched all ground-truth citations.
    """
    if not ground_truth:
        return 1.0
    gt_norms = [_normalize_citation(g) for g in ground_truth]
    # Drop empty predictions: "" would substring-match everything.
    pred_norms = [p for p in (_normalize_citation(x) for x in predicted) if p]
    matched = 0
    for gt in gt_norms:
        for pred in pred_norms:
            if gt in pred or pred in gt:
                matched += 1
                break
    return matched / len(gt_norms)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 64 |
+
# B. STRUCTURAL SCORING
|
| 65 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 66 |
+
|
| 67 |
+
# ── B1. Terminology precision ────────────────────────────────────────────────
|
| 68 |
+
|
| 69 |
+
def terminology_score(answer: str, meta: StructuralMeta) -> float:
    """Score domain-terminology usage; result lies in [-1.0, 1.0].

    Recall over ``meta.required_terms`` (1.0 when none are required), minus a
    0.25 penalty per ``meta.banned_terms`` entry found in the answer, with the
    total penalty capped at 1.0. Matching is case-insensitive substring.
    """
    haystack = answer.lower()

    # Recall of required terms (substring, case-insensitive).
    if meta.required_terms:
        hits = sum(t.lower() in haystack for t in meta.required_terms)
        term_recall = hits / len(meta.required_terms)
    else:
        term_recall = 1.0

    # Capped penalty for misconception-indicating terms.
    penalty = 0.0
    if meta.banned_terms:
        found = sum(b.lower() in haystack for b in meta.banned_terms)
        penalty = min(0.25 * found, 1.0)

    return term_recall - penalty
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# ── B2. Completeness (section coverage) ──────────────────────────────────────
|
| 97 |
+
|
| 98 |
+
def completeness_score(answer: str, meta: StructuralMeta) -> float:
    """Fraction of ``meta.required_sections`` covered by the answer, in [0, 1].

    Heuristic: a section counts as covered when at least half of the tokens
    of its label appear (case-insensitive substring) in the answer. Returns
    1.0 when no sections are required.
    """
    if not meta.required_sections:
        return 1.0

    haystack = answer.lower()
    covered = 0
    for label in meta.required_sections:
        keywords = _tokenize(label)
        if not keywords:
            # a label with no tokens can never be covered
            continue
        present = sum(kw in haystack for kw in keywords)
        if present / len(keywords) >= 0.5:
            covered += 1
    return covered / len(meta.required_sections)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ── B3. Logical ordering (sequence adherence) ────────────────────────────────
|
| 120 |
+
|
| 121 |
+
def ordering_score(answer: str, meta: StructuralMeta) -> float:
    """Measure how well ``meta.expected_order`` concepts appear in sequence.

    Each concept is located at the earliest occurrence of any of its label's
    tokens; the score is the fraction of adjacent concept pairs whose
    positions are non-decreasing. Trivially 1.0 with fewer than two concepts.

    NOTE(review): concepts never found are placed at len(answer)+1, so two
    missing concepts still count as "in order" — presumably intended
    leniency; confirm.
    """
    if len(meta.expected_order) < 2:
        return 1.0

    haystack = answer.lower()
    sentinel = len(haystack) + 1  # stands in for "concept not found"
    positions: List[int] = []
    for concept in meta.expected_order:
        hits = [haystack.find(kw) for kw in _tokenize(concept)]
        found = [i for i in hits if i != -1]
        positions.append(min(found) if found else sentinel)

    in_order = sum(
        positions[i] <= positions[i + 1] for i in range(len(positions) - 1)
    )
    return in_order / (len(positions) - 1)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
# ── B4. Coherence (transition quality + sentence structure) ──────────────────
|
| 151 |
+
|
| 152 |
+
# Discourse connectives used as a cheap proxy for inter-sentence cohesion.
_TRANSITION_MARKERS = {
    "therefore", "however", "moreover", "furthermore", "thus", "consequently",
    "specifically", "in contrast", "for example", "similarly", "additionally",
    "because", "since", "although", "while", "first", "second", "third",
    "finally", "in particular", "notably", "according to", "this means",
    "as a result", "in other words",
}


def coherence_score(answer: str) -> float:
    """Lightweight coherence proxy in [0, 1].

    Blend of transition-marker density (50 %), sentence-length uniformity
    (30 %), and a flat 0.2 baseline for being multi-sentence. A one-sentence
    answer scores a fixed 0.3.
    """
    sentences = [s.strip() for s in re.split(r"[.!?]+", answer) if s.strip()]
    if len(sentences) <= 1:
        return 0.3  # single sentence is structurally weak for these tasks

    lowered = answer.lower()
    marker_count = sum(m in lowered for m in _TRANSITION_MARKERS)
    marker_density = min(marker_count / max(len(sentences) - 1, 1), 1.0)

    word_counts = [len(s.split()) for s in sentences]
    mean_len = sum(word_counts) / len(word_counts)
    if mean_len == 0:
        return 0.2  # defensive; stripped sentences always hold ≥1 token
    variance = sum((n - mean_len) ** 2 for n in word_counts) / len(word_counts)
    cv = (variance ** 0.5) / mean_len  # coefficient of variation
    uniformity = max(0.0, 1.0 - cv)

    return 0.5 * marker_density + 0.3 * uniformity + 0.2
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# ── Composite structural score ───────────────────────────────────────────────
|
| 192 |
+
|
| 193 |
+
def structural_quality(answer: str, meta: StructuralMeta) -> Tuple[float, dict]:
    """Weighted composite of all structural axes.

    Weights: terminology 0.30 (clamped at 0 in the blend), completeness 0.25,
    ordering 0.25, coherence 0.20. A negative terminology score (banned terms)
    leaks back in at 0.15× before the result is clipped to [0, 1].

    Returns (score, breakdown) where breakdown holds each axis rounded to 3 dp.
    """
    term = terminology_score(answer, meta)
    comp = completeness_score(answer, meta)
    order = ordering_score(answer, meta)
    coh = coherence_score(answer)

    score = (
        0.30 * max(term, 0.0)
        + 0.25 * comp
        + 0.25 * order
        + 0.20 * coh
    )
    if term < 0:
        score += 0.15 * term  # banned-term penalty propagates past the clamp
    score = max(0.0, min(1.0, score))

    return score, {
        "terminology": round(term, 3),
        "completeness": round(comp, 3),
        "ordering": round(order, 3),
        "coherence": round(coh, 3),
        "composite": round(score, 3),
    }
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 228 |
+
# PER-STEP REWARD
|
| 229 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 230 |
+
|
| 231 |
+
def step_reward(state: EnvState, action_type: ActionType, payload: str | None) -> Tuple[float, str]:
    """
    Compute per-step reward and feedback message.
    Now accounts for structural improvement on EDIT and RESTRUCTURE.

    Returns:
        (reward, feedback): a shaped per-step reward and a human-readable
        message surfaced to the agent. ACCEPT/REJECT yield (0.0, "") here;
        their payoff is computed by terminal_reward().
    """
    reward = 0.0
    feedback = ""

    if action_type == ActionType.RETRIEVE:
        # Reward useful retrieval; punish redundancy past 3 calls or
        # retrieving when nothing is available.
        if state.retrieval_count >= 3:
            reward = -0.15
            feedback = "Redundant retrieval — you've already retrieved 3 times."
        elif state.available_passages:
            reward = 0.05
            feedback = "Passages retrieved."
        else:
            reward = -0.05
            feedback = "No passages available for retrieval."

    elif action_type == ActionType.EDIT:
        if not payload:
            reward = -0.10
            feedback = "Empty edit — no content provided."
        else:
            # factual delta
            old_sim = token_f1(state.current_answer, state.ground_truth_answer)
            new_sim = token_f1(payload, state.ground_truth_answer)
            fact_delta = new_sim - old_sim

            # structural delta
            old_struct, _ = structural_quality(state.current_answer, state.structural_meta)
            new_struct, bk = structural_quality(payload, state.structural_meta)
            struct_delta = new_struct - old_struct

            # EDIT is judged 60 % on facts, 40 % on structure.
            combined_delta = 0.6 * fact_delta + 0.4 * struct_delta

            # ±0.03 dead-band around zero counts as "negligible".
            if combined_delta > 0.03:
                reward = 0.20 + combined_delta
                feedback = f"Edit improved answer (fact Δ{fact_delta:+.2f}, struct Δ{struct_delta:+.2f})."
            elif combined_delta < -0.03:
                reward = -0.20
                feedback = f"Edit degraded answer (fact Δ{fact_delta:+.2f}, struct Δ{struct_delta:+.2f})."
            else:
                reward = -0.05
                feedback = "Edit had negligible effect."

    elif action_type == ActionType.RESTRUCTURE:
        if not payload:
            reward = -0.10
            feedback = "Empty restructure — no content provided."
        else:
            # restructure should preserve facts but improve structure
            old_sim = token_f1(state.current_answer, state.ground_truth_answer)
            new_sim = token_f1(payload, state.ground_truth_answer)
            fact_delta = new_sim - old_sim

            old_struct, _ = structural_quality(state.current_answer, state.structural_meta)
            new_struct, bk = structural_quality(payload, state.structural_meta)
            struct_delta = new_struct - old_struct

            if fact_delta < -0.10:
                # restructure destroyed factual content
                reward = -0.25
                feedback = f"Restructure lost factual content (fact Δ{fact_delta:+.2f}). Use EDIT if changing facts."
            elif struct_delta > 0.05:
                # Structural gain: base 0.25 plus the improvement itself,
                # with the per-axis breakdown echoed back to the agent.
                reward = 0.25 + struct_delta
                feedback = (
                    f"Restructure improved structure (Δ{struct_delta:+.2f}). "
                    f"Breakdown: term={bk['terminology']:.2f} comp={bk['completeness']:.2f} "
                    f"order={bk['ordering']:.2f} coh={bk['coherence']:.2f}"
                )
            elif struct_delta < -0.03:
                reward = -0.15
                feedback = f"Restructure degraded structure (Δ{struct_delta:+.2f})."
            else:
                reward = -0.05
                feedback = "Restructure had negligible structural effect."

    elif action_type == ActionType.CITE:
        if not payload:
            reward = -0.05
            feedback = "Empty citation."
        else:
            # A citation is "correct" if it fuzzily matches any expected source.
            cr = citation_recall([payload], state.ground_truth_citations)
            if cr > 0:
                reward = 0.15
                feedback = "Correct citation added."
            else:
                reward = -0.05
                feedback = "Citation does not match expected sources."

    elif action_type in (ActionType.ACCEPT, ActionType.REJECT):
        pass  # terminal rewards handled separately

    return reward, feedback
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 329 |
+
# TERMINAL REWARD
|
| 330 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 331 |
+
|
| 332 |
+
def terminal_reward(state: EnvState, action_type: ActionType) -> Tuple[float, str]:
    """Compute the episode-ending reward, blending factual AND structural quality.

    REJECT is scored purely on whether the stored answer was genuinely flawed.
    ACCEPT is a weighted sum of factual similarity, citation recall, structural
    quality, and a step-efficiency bonus (theoretical max ≈ 2.10), with an
    extra penalty when a clearly bad answer is accepted anyway.

    Returns:
        (reward, feedback) — the scalar terminal reward and a human-readable
        explanation of how it was composed.
    """
    # ── REJECT ─────────────────────────────────────────────────────────
    if action_type == ActionType.REJECT:
        if state.answer_is_correct:
            return -0.50, "Incorrectly rejected a valid answer."
        return 0.30, "Correctly rejected a flawed answer."

    # ── ACCEPT ─────────────────────────────────────────────────────────
    # factual component
    fact = token_f1(state.current_answer, state.ground_truth_answer)
    cite = citation_recall(state.current_citations, state.ground_truth_citations)

    # structural component
    struct, breakdown = structural_quality(state.current_answer, state.structural_meta)

    # efficiency bonus scales with unused steps (0–0.2)
    step_bonus = 0.20 * (state.steps_remaining / state.max_steps)

    # weighted terminal reward — theoretical max ≈ 2.10
    total = (
        0.90 * fact        # factual similarity (max 0.90)
        + 0.30 * cite      # citation recall (max 0.30)
        + 0.70 * struct    # structural quality (max 0.70)
        + step_bonus       # efficiency bonus (max 0.20)
    )

    # penalty for accepting a still-bad answer, plus a quality label
    if fact < 0.3 and struct < 0.3:
        total -= 0.50
        label = "poor"
    else:
        label = "mediocre" if fact < 0.5 else "good"

    feedback = (
        f"Accepted a {label} answer "
        f"(fact={fact:.2f}, cite={cite:.2f}, struct={struct:.2f} "
        f"[term={breakdown['terminology']:.2f} "
        f"comp={breakdown['completeness']:.2f} "
        f"ord={breakdown['ordering']:.2f} "
        f"coh={breakdown['coherence']:.2f}])"
    )

    return total, feedback
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 386 |
+
# SCORE NORMALISATION
|
| 387 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 388 |
+
|
| 389 |
+
# theoretical max: terminal ~2.10 + step bonuses ~0.5 ≈ 2.6
MAX_REASONABLE_REWARD = 2.80


def normalize_score(cumulative_reward: float) -> float:
    """Clamp cumulative reward into [0, 1].

    The raw episode return is divided by MAX_REASONABLE_REWARD and the
    resulting fraction is clipped to the unit interval.
    """
    return max(0.0, min(1.0, cumulative_reward / MAX_REASONABLE_REWARD))
|
tasks.py
ADDED
|
@@ -0,0 +1,448 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task definitions for IndicScriptureQA.
|
| 3 |
+
|
| 4 |
+
Each scenario now carries *structural metadata* alongside factual ground truth.
|
| 5 |
+
The grader evaluates BOTH factual accuracy AND semantic structure:
|
| 6 |
+
- required_terms → Sanskrit/domain terms the answer must use
|
| 7 |
+
- required_sections → conceptual aspects that must be covered
|
| 8 |
+
- expected_order → logical ordering of concepts
|
| 9 |
+
- banned_terms → misconception markers (penalty if present)
|
| 10 |
+
|
| 11 |
+
Difficulty controls:
|
| 12 |
+
easy → blatant factual error OR correct answer; max 5 steps
|
| 13 |
+
medium → partial errors + missing citations + poor structure; max 8 steps
|
| 14 |
+
hard → subtle hallucinations + jumbled structure + terminology misuse; max 12 steps
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import dataclasses
|
| 20 |
+
from typing import Dict, List, Optional
|
| 21 |
+
|
| 22 |
+
from models import StructuralMeta
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclasses.dataclass
class Scenario:
    """One QA episode: a question, a (possibly flawed) given answer, the
    factual ground truth, and the structural metadata the grader scores
    against. See the module docstring for how the structural fields are used.
    """

    question: str  # the scripture question posed to the agent
    given_answer: str  # the initial answer the agent must verify/repair
    ground_truth_answer: str  # reference answer used for factual scoring
    ground_truth_citations: List[str]  # citations the agent is expected to recover
    available_passages: List[str]  # source passages made available to the agent
    answer_is_correct: bool  # overall (facts + structure)
    factual_is_correct: bool  # facts alone
    structural_meta: StructuralMeta  # required terms/sections/ordering/banned terms
    structural_hints: List[str]  # non-spoiler hints shown to agent
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 39 |
+
# TASK 1 — verify-factual (Easy)
|
| 40 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 41 |
+
|
| 42 |
+
VERIFY_FACTUAL_SCENARIOS: List[Scenario] = [
    # 1. Wrong actor: Lakshmana credited with Rama's deed → incorrect answer.
    Scenario(
        question="Who killed Ravana in the Ramayana?",
        given_answer="Lakshmana killed Ravana with a divine arrow during the battle of Lanka.",
        ground_truth_answer="Rama killed Ravana using the Brahmastra during the battle of Lanka.",
        ground_truth_citations=["Valmiki Ramayana, Yuddha Kanda, Sarga 108"],
        available_passages=[
            "Yuddha Kanda 108: Rama, following Agastya's counsel, invoked the Brahmastra and struck Ravana in the chest, ending his life.",
            "Yuddha Kanda 101: Lakshmana fought Indrajit and defeated him but did not fight Ravana directly in the final duel.",
        ],
        answer_is_correct=False,
        factual_is_correct=False,
        structural_meta=StructuralMeta(
            required_terms=["Rama", "Ravana", "Brahmastra"],
            required_sections=["who performed the act", "weapon used", "context of battle"],
            expected_order=["context of battle", "weapon used", "outcome"],
            banned_terms=[],
        ),
        structural_hints=["Answer should identify the warrior, the weapon, and the battle context."],
    ),
    # 2. Fully correct answer (facts and structure) → accept expected.
    Scenario(
        question="How many chapters does the Bhagavad Gita contain?",
        given_answer="The Bhagavad Gita contains 18 chapters, each called an Adhyaya.",
        ground_truth_answer="The Bhagavad Gita contains 18 chapters (Adhyayas), comprising 700 verses.",
        ground_truth_citations=["Bhagavad Gita, Chapters 1–18"],
        available_passages=[
            "The Bhagavad Gita is a 700-verse scripture that is part of the Mahabharata (Bhishma Parva, chapters 25–42). It consists of 18 chapters.",
        ],
        answer_is_correct=True,
        factual_is_correct=True,
        structural_meta=StructuralMeta(
            required_terms=["Adhyaya", "18"],
            required_sections=["chapter count", "structure note"],
            expected_order=[],
            banned_terms=[],
        ),
        structural_hints=["Mention chapter count and the term for chapters."],
    ),
    # 3. Wrong speaker: Vyasa (the composer) confused with Krishna → incorrect.
    Scenario(
        question="Who narrated the Bhagavad Gita to Arjuna?",
        given_answer="Vyasa narrated the Bhagavad Gita to Arjuna on the battlefield of Kurukshetra.",
        ground_truth_answer="Krishna narrated the Bhagavad Gita to Arjuna on the battlefield of Kurukshetra.",
        ground_truth_citations=["Bhagavad Gita 1.1", "Bhagavad Gita 2.11"],
        available_passages=[
            "Bhagavad Gita 2.11: Sri Bhagavan (Krishna) said — You grieve for those who should not be grieved for.",
            "Vyasa composed the Mahabharata and dictated it to Ganesha but was not the speaker of the Gita.",
        ],
        answer_is_correct=False,
        factual_is_correct=False,
        structural_meta=StructuralMeta(
            required_terms=["Krishna", "Arjuna", "Kurukshetra"],
            required_sections=["speaker", "listener", "setting"],
            expected_order=["speaker", "listener", "setting"],
            banned_terms=[],
        ),
        structural_hints=["Identify the speaker, the listener, and the setting."],
    ),
    # 4. Correct answer with proper grounding → accept expected.
    Scenario(
        question="What is the first word of the Rigveda?",
        given_answer="The first word of the Rigveda is 'Agnim' (अग्निम्), invoking the fire deity Agni.",
        ground_truth_answer="The first word of the Rigveda is 'Agnim' (अग्निम्), the accusative form of Agni, beginning the hymn to the fire deity.",
        ground_truth_citations=["Rigveda 1.1.1"],
        available_passages=[
            "Rigveda 1.1.1: Agnim ile purohitam yajnasya devam ritvijam — I praise Agni, the foremost priest, the divine minister of the sacrifice.",
        ],
        answer_is_correct=True,
        factual_is_correct=True,
        structural_meta=StructuralMeta(
            required_terms=["Agnim", "Agni", "Rigveda"],
            required_sections=["the word", "its meaning", "its significance"],
            expected_order=["the word", "its meaning"],
            banned_terms=[],
        ),
        structural_hints=["State the word, explain its grammatical form, and note its significance."],
    ),
    # 5. Wrong commander: Drona (second C-in-C) instead of Bhishma → incorrect.
    Scenario(
        question="In the Mahabharata, who was the commander-in-chief of the Kaurava army on the first day?",
        given_answer="Drona was the first commander-in-chief of the Kaurava army at Kurukshetra.",
        ground_truth_answer="Bhishma was the first commander-in-chief of the Kaurava army, leading from days 1 through 10.",
        ground_truth_citations=["Mahabharata, Bhishma Parva"],
        available_passages=[
            "Bhishma Parva: Bhishma was appointed supreme commander of the Kaurava forces. He led the army for the first ten days of the war.",
            "Drona Parva: After Bhishma fell, Drona was appointed the second commander-in-chief.",
        ],
        answer_is_correct=False,
        factual_is_correct=False,
        structural_meta=StructuralMeta(
            required_terms=["Bhishma", "Kaurava"],
            required_sections=["commander identity", "duration of command"],
            expected_order=["commander identity", "duration of command"],
            banned_terms=[],
        ),
        structural_hints=["Identify the commander and state how long they held command."],
    ),
]
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 140 |
+
# TASK 2 — correct-and-cite (Medium)
|
| 141 |
+
# Scenarios now include structural problems: missing aspects, wrong ordering,
|
| 142 |
+
# imprecise terminology, alongside the citation/factual issues.
|
| 143 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 144 |
+
|
| 145 |
+
CORRECT_AND_CITE_SCENARIOS: List[Scenario] = [
    # 1. Paraphrased teaching lacking the Sanskrit term (nishkama karma) and
    #    verse grounding → needs terminology + citations.
    Scenario(
        question="What does Krishna say about Karma Yoga in the Bhagavad Gita?",
        given_answer="Krishna tells Arjuna to perform his duty without attachment to results. He says action is superior to inaction and that one should work selflessly.",
        ground_truth_answer="Krishna teaches that one has the right to perform prescribed duties but is not entitled to the fruits of actions. He advocates nishkama karma — selfless action without attachment to outcomes — as the path to liberation.",
        ground_truth_citations=["Bhagavad Gita 2.47", "Bhagavad Gita 3.19"],
        available_passages=[
            "Bhagavad Gita 2.47: Karmanye vadhikaraste ma phaleshu kadachana — You have the right to perform your duty, but you are not entitled to the fruits of your actions.",
            "Bhagavad Gita 3.19: Therefore, without attachment, always perform the work that has to be done; for by performing work without attachment, one attains the Supreme.",
            "Bhagavad Gita 3.4: Not by merely abstaining from action can one achieve freedom from reaction, nor by renunciation alone can one attain perfection.",
        ],
        answer_is_correct=False,
        factual_is_correct=False,
        structural_meta=StructuralMeta(
            required_terms=["nishkama karma", "Karma Yoga", "phala"],
            required_sections=["core teaching", "key verse reference", "philosophical implication"],
            expected_order=["core teaching", "key verse reference", "philosophical implication"],
            banned_terms=[],
        ),
        structural_hints=[
            "Use the Sanskrit term for selfless action.",
            "Structure: teaching → supporting verse → implication for liberation.",
        ],
    ),
    # 2. Facts right but structurally incomplete: each Mahavakya must be
    #    paired with its source Upanishad.
    Scenario(
        question="What are the four Mahavakyas of the Upanishads?",
        given_answer="The four Mahavakyas are: Prajnanam Brahma, Aham Brahmasmi, Tat Tvam Asi, and Ayam Atma Brahma. They express the identity of the self with Brahman.",
        ground_truth_answer="The four Mahavakyas are: 'Prajnanam Brahma' (Consciousness is Brahman) from Aitareya Upanishad, 'Aham Brahmasmi' (I am Brahman) from Brihadaranyaka Upanishad, 'Tat Tvam Asi' (Thou art That) from Chandogya Upanishad, and 'Ayam Atma Brahma' (This Self is Brahman) from Mandukya Upanishad.",
        ground_truth_citations=[
            "Aitareya Upanishad 3.3",
            "Brihadaranyaka Upanishad 1.4.10",
            "Chandogya Upanishad 6.8.7",
            "Mandukya Upanishad 1.2",
        ],
        available_passages=[
            "Aitareya Upanishad 3.3: Prajnanam Brahma — Consciousness is Brahman. This Mahavakya belongs to the Rigveda.",
            "Brihadaranyaka Upanishad 1.4.10: Aham Brahmasmi — I am Brahman. This declaration belongs to the Yajurveda.",
            "Chandogya Upanishad 6.8.7: Tat Tvam Asi — Thou art That. Uddalaka teaches this to Shvetaketu. Belongs to the Samaveda.",
            "Mandukya Upanishad 1.2: Ayam Atma Brahma — This Self is Brahman. Belongs to the Atharvaveda.",
        ],
        answer_is_correct=False,
        factual_is_correct=True,  # the four names are correct; missing source attribution
        structural_meta=StructuralMeta(
            required_terms=["Mahavakya", "Brahman", "Atman"],
            required_sections=["each vakya with translation", "source Upanishad for each", "unifying theme"],
            expected_order=["vakya list", "unifying theme"],
            banned_terms=[],
        ),
        structural_hints=[
            "Each Mahavakya should be paired with its source Upanishad.",
            "Conclude with the unifying Advaita theme.",
        ],
    ),
    # 3. Facts right but missing the technical vocabulary (sukshma,
    #    rajadharma, apaddharma) and the key dictum.
    Scenario(
        question="Describe the concept of Dharma as explained in the Mahabharata.",
        given_answer="Dharma in the Mahabharata is presented as a complex moral code. Bhishma explains dharma extensively while lying on the bed of arrows. The text suggests dharma is subtle and context-dependent.",
        ground_truth_answer="The Mahabharata presents dharma as subtle (sukshma) and context-dependent. Bhishma's discourse on dharma in the Shanti Parva and Anushasana Parva covers duties of kings, ethics of war, and personal righteousness. The famous dictum states: 'Dharma is that which sustains all beings.'",
        ground_truth_citations=["Mahabharata, Shanti Parva", "Mahabharata, Anushasana Parva"],
        available_passages=[
            "Shanti Parva: Bhishma, lying on the bed of arrows, instructs Yudhishthira on rajadharma (duties of kings), moksha-dharma, and apaddharma (ethics in emergencies).",
            "Anushasana Parva: Continuation of Bhishma's teachings on dana (charity), dharma, and moral conduct.",
            "Vana Parva 313.117: Dharmo rakshati rakshitah — Dharma protects those who protect dharma.",
        ],
        answer_is_correct=False,
        factual_is_correct=True,
        structural_meta=StructuralMeta(
            required_terms=["sukshma", "rajadharma", "apaddharma", "dharma"],
            required_sections=["definition of dharma", "Bhishma's discourse topics", "key dictum"],
            expected_order=["definition of dharma", "Bhishma's discourse topics", "key dictum"],
            banned_terms=[],
        ),
        structural_hints=[
            "Use the Sanskrit term for dharma's subtlety.",
            "Cover the three categories Bhishma teaches: rajadharma, moksha-dharma, apaddharma.",
        ],
    ),
    # 4. Facts right but coverage incomplete: ritual and mathematical
    #    dimensions of 108 are missing.
    Scenario(
        question="What is the significance of the number 108 in Hindu scriptures?",
        given_answer="The number 108 is sacred because there are 108 Upanishads in the Muktika canon. Japa malas have 108 beads for chanting.",
        ground_truth_answer="108 is considered sacred in Hinduism for multiple reasons: the Muktika canon lists 108 Upanishads, japa malas contain 108 beads, there are 108 names (ashtottara) for major deities, and mathematically 1×(2²)×(3³)=108. The distance between the Earth and Sun is approximately 108 times the Sun's diameter.",
        ground_truth_citations=["Muktika Upanishad 1.30–39"],
        available_passages=[
            "Muktika Upanishad: Rama narrates 108 Upanishads to Hanuman. The list includes major (mukhya) and minor Upanishads.",
            "In Hindu tradition, 108 appears in mala beads, temple steps, and as the count of names in ashtottara-namavali (108 names of deities like Vishnu, Shiva, Lakshmi).",
        ],
        answer_is_correct=False,
        factual_is_correct=True,
        structural_meta=StructuralMeta(
            required_terms=["ashtottara", "Muktika", "japa"],
            required_sections=["scriptural significance", "ritual significance", "mathematical note"],
            expected_order=["scriptural significance", "ritual significance"],
            banned_terms=[],
        ),
        structural_hints=[
            "Cover scriptural, ritual, AND mathematical dimensions.",
            "Use the term 'ashtottara-namavali' for the 108-name tradition.",
        ],
    ),
    # 5. Already correct and well-structured → accept expected.
    Scenario(
        question="Who composed the Yoga Sutras?",
        given_answer="Patanjali composed the Yoga Sutras, a foundational text of Raja Yoga, consisting of 196 sutras organized into four padas.",
        ground_truth_answer="Patanjali composed the Yoga Sutras, a foundational text of Raja Yoga comprising 196 sutras in four padas: Samadhi Pada, Sadhana Pada, Vibhuti Pada, and Kaivalya Pada.",
        ground_truth_citations=["Yoga Sutras of Patanjali 1.1"],
        available_passages=[
            "Yoga Sutras 1.1: Atha yoga-anushasanam — Now, the exposition of Yoga is being made.",
            "The four padas are: Samadhi Pada (51 sutras), Sadhana Pada (55 sutras), Vibhuti Pada (56 sutras), and Kaivalya Pada (34 sutras).",
        ],
        answer_is_correct=True,
        factual_is_correct=True,
        structural_meta=StructuralMeta(
            required_terms=["Patanjali", "Raja Yoga", "pada"],
            required_sections=["author", "text structure", "pada names"],
            expected_order=["author", "text structure", "pada names"],
            banned_terms=[],
        ),
        structural_hints=["Name all four padas explicitly."],
    ),
]
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 266 |
+
# TASK 3 — fix-hallucination (Hard)
|
| 267 |
+
# Includes subtle factual errors PLUS structural/terminological problems:
|
| 268 |
+
# wrong ordering of concepts, misused Sanskrit terms, incomplete coverage.
|
| 269 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 270 |
+
|
| 271 |
+
FIX_HALLUCINATION_SCENARIOS: List[Scenario] = [
    # 1. Two subtle hallucinations: Narasimha's yuga (Treta vs Satya) and
    #    Vamana's adversary (Ravana vs Mahabali); list structure also weak.
    Scenario(
        question="Explain the Dashavatara (ten avatars) of Vishnu as described in the Puranas.",
        given_answer="The ten avatars of Vishnu are: Matsya (fish), Kurma (tortoise), Varaha (boar), Narasimha (man-lion), Vamana (dwarf), Parashurama, Rama, Balarama, Buddha, and Kalki. Each avatar appeared in a specific yuga to restore cosmic order. Narasimha appeared in Treta Yuga to defeat the demon Hiranyakashipu. Vamana tricked the demon king Ravana by asking for three steps of land.",
        ground_truth_answer="The ten avatars of Vishnu are: Matsya, Kurma, Varaha, Narasimha, Vamana, Parashurama, Rama, Krishna (or Balarama in some lists), Buddha, and Kalki. Narasimha appeared in Satya Yuga (not Treta) to defeat Hiranyakashipu. Vamana tricked the demon king Mahabali (not Ravana) by asking for three paces of land. The avatars follow an evolutionary sequence reflecting increasing biological complexity.",
        ground_truth_citations=[
            "Bhagavata Purana 1.3",
            "Garuda Purana 1.86",
            "Bhagavata Purana 7.8 (Narasimha)",
            "Bhagavata Purana 8.18–22 (Vamana)",
        ],
        available_passages=[
            "Bhagavata Purana 1.3: Lists the avatars of Vishnu including Matsya, Kurma, Varaha, Narasimha, Vamana, Parashurama, Rama, Krishna, Buddha, and Kalki.",
            "Bhagavata Purana 7.8: Narasimha avatar manifested in Satya Yuga to protect Prahlada and slay the demon Hiranyakashipu.",
            "Bhagavata Purana 8.18–22: Vamana (dwarf avatar) approached the generous demon king Mahabali during a yajna and requested three paces of land, then expanded to cosmic form.",
            "Garuda Purana 1.86: Alternate lists sometimes place Balarama as the eighth avatar instead of Krishna.",
        ],
        answer_is_correct=False,
        factual_is_correct=False,
        structural_meta=StructuralMeta(
            required_terms=["Dashavatara", "Satya Yuga", "Mahabali", "Vishnu"],
            required_sections=["complete avatar list", "yuga assignment for key avatars", "purpose of each avatar", "evolutionary sequence note"],
            expected_order=["avatar list in order", "individual avatar details", "overarching pattern"],
            banned_terms=["Treta Yuga for Narasimha", "Ravana for Vamana"],
        ),
        structural_hints=[
            "List avatars in canonical order, then elaborate on key ones.",
            "Note the evolutionary progression from aquatic to human forms.",
            "Narasimha is Satya Yuga, not Treta.",
        ],
    ),
    # 2. Wrong sage (Vishwamitra vs Narada), wrong source text (Ramayana vs
    #    Mahabharata), and the method of victory is mischaracterised.
    Scenario(
        question="What is the story of Savitri and Satyavan from the Mahabharata?",
        given_answer="Savitri was a princess who chose Satyavan as her husband despite the sage Vishwamitra's warning that Satyavan would die within a year. When Yama came to claim Satyavan's soul, Savitri followed Yama for seven days. Impressed by her devotion, Yama granted three boons including Satyavan's life. The story is found in the Ramayana's Aranya Kanda.",
        ground_truth_answer="Savitri chose Satyavan despite the sage Narada's warning (not Vishwamitra) that Satyavan was fated to die within a year. When Yama came, Savitri followed him and through her wisdom and persistence obtained boons. The story appears in the Mahabharata's Vana Parva (not Ramayana's Aranya Kanda). Savitri's victory came through dialectical skill — she argued Yama into granting life, not through mere devotion alone.",
        ground_truth_citations=[
            "Mahabharata, Vana Parva, Chapters 277–283 (Pativrata Mahatmya)",
        ],
        available_passages=[
            "Mahabharata, Vana Parva 277: Sage Narada warned King Asvapati and Savitri that Satyavan, though virtuous, was destined to die exactly one year after their marriage.",
            "Mahabharata, Vana Parva 281–283: Savitri followed Yama as he carried Satyavan's soul. Through her persistent arguments and wisdom, Yama granted boons: her father-in-law's sight restored, his lost kingdom returned, and finally Satyavan's life.",
            "The Savitri-Satyavan story is entirely within the Mahabharata (Vana Parva). It does not appear in the Ramayana.",
        ],
        answer_is_correct=False,
        factual_is_correct=False,
        structural_meta=StructuralMeta(
            required_terms=["Narada", "Yama", "Vana Parva", "pativrata"],
            required_sections=["Savitri's choice", "Narada's warning", "confrontation with Yama", "method of victory", "source text"],
            expected_order=["Savitri's choice", "Narada's warning", "confrontation with Yama", "method of victory"],
            banned_terms=["Vishwamitra", "Aranya Kanda", "Ramayana"],
        ),
        structural_hints=[
            "Narrative structure: choice → prophecy → confrontation → resolution.",
            "Emphasise Savitri's dialectical skill, not just devotion.",
            "This story is in the Mahabharata, not the Ramayana.",
        ],
    ),
    # 3. Three hallucinations: Meru vs Mandara, Takshaka vs Vasuki, and
    #    Vishnu vs Shiva as the poison-drinker.
    Scenario(
        question="Describe the Samudra Manthan (Churning of the Ocean) episode.",
        given_answer="The devas and asuras churned the ocean of milk using Mount Meru as the churning rod and the serpent Takshaka as the rope. Vishnu took the form of Kurma (tortoise) to support the mountain. Fourteen treasures emerged including Amrita (nectar), Lakshmi, the deadly poison Halahala which was consumed by Vishnu, and the divine horse Uchchaihshravas. Mohini distributed the Amrita exclusively to the devas.",
        ground_truth_answer="The devas and asuras churned the ocean using Mount Mandara (not Meru) as the rod and the serpent Vasuki (not Takshaka) as the rope. Kurma supported Mandara on his back. Among the fourteen treasures were Amrita, Lakshmi, Halahala, and Uchchaihshravas. The Halahala poison was consumed by Shiva (not Vishnu), who held it in his throat, earning the name Neelakantha. Vishnu as Mohini distributed the Amrita.",
        ground_truth_citations=[
            "Bhagavata Purana 8.5–12",
            "Vishnu Purana 1.9",
            "Mahabharata, Adi Parva 15–17",
        ],
        available_passages=[
            "Bhagavata Purana 8.7: Mount Mandara was used as the churning rod. The serpent Vasuki served as the rope. Vishnu as Kurma supported the mountain from below.",
            "Bhagavata Purana 8.7: The Halahala poison emerged first, threatening all creation. Shiva drank the poison at Parvati's urging, holding it in his throat. His throat turned blue — hence the name Neelakantha.",
            "Vishnu Purana 1.9: Fourteen ratnas (treasures) emerged: Lakshmi, Kaustubha gem, Parijata tree, Varuni, Dhanvantari with Amrita, Chandra (moon), Kamadhenu, Airavata, Uchchaihshravas, and others.",
            "Bhagavata Purana 8.12: Vishnu assumed the form of Mohini to distribute Amrita among the devas, deceiving the asuras.",
        ],
        answer_is_correct=False,
        factual_is_correct=False,
        structural_meta=StructuralMeta(
            required_terms=["Mandara", "Vasuki", "Halahala", "Neelakantha", "Mohini", "ratna"],
            required_sections=["setup and participants", "churning mechanism", "crisis (poison)", "treasures", "resolution (Amrita distribution)"],
            expected_order=["setup and participants", "churning mechanism", "crisis (poison)", "treasures", "resolution (Amrita distribution)"],
            banned_terms=["Meru as churning rod", "Takshaka as rope", "Vishnu drank poison"],
        ),
        structural_hints=[
            "Follow the narrative arc: setup → churning → crisis → treasures → resolution.",
            "The mountain is Mandara, the serpent is Vasuki.",
            "Shiva consumed the poison, not Vishnu.",
        ],
    ),
    # 4. Founder/commentator confusion: Kumarila Bhatta and Shankaracharya
    #    were commentators, not founders of Mimamsa/Vedanta.
    Scenario(
        question="What are the main philosophical schools (Darshanas) of Hinduism?",
        given_answer="The six orthodox darshanas are: Samkhya (founded by Kapila), Yoga (by Patanjali), Nyaya (by Gotama), Vaisheshika (by Kanada), Mimamsa (by Kumarila Bhatta), and Vedanta (by Shankaracharya). These are called astika schools because they accept the authority of the Vedas. The heterodox schools include Buddhism, Jainism, and the Charvaka school founded by Brihaspati.",
        ground_truth_answer="The six orthodox darshanas are: Samkhya (Kapila), Yoga (Patanjali), Nyaya (Gotama/Akshapada), Vaisheshika (Kanada), Purva Mimamsa (Jaimini, not Kumarila Bhatta — Kumarila was a later commentator), and Uttara Mimamsa/Vedanta (Badarayana/Vyasa composed the Brahma Sutras; Shankaracharya was a later commentator, not the founder). The heterodox (nastika) schools include Buddhism, Jainism, and Charvaka/Lokayata. Each school uses a distinct epistemological method (pramana).",
        ground_truth_citations=[
            "Sarva-Darshana-Sangraha by Madhvacharya",
            "Brahma Sutras by Badarayana",
            "Mimamsa Sutras by Jaimini",
        ],
        available_passages=[
            "The six astika darshanas accept Vedic authority. Purva Mimamsa was founded by Jaimini (author of Mimamsa Sutras). Kumarila Bhatta and Prabhakara were later commentators, not founders.",
            "Vedanta (Uttara Mimamsa) is based on the Brahma Sutras by Badarayana. Adi Shankaracharya (8th century CE) was the most influential commentator but not the founder of the school.",
            "Sarva-Darshana-Sangraha by Madhvacharya surveys all philosophical schools including both astika and nastika traditions.",
            "Charvaka (also called Lokayata) is a materialist school. Attribution to Brihaspati is traditional but historically uncertain.",
        ],
        answer_is_correct=False,
        factual_is_correct=False,
        structural_meta=StructuralMeta(
            required_terms=["astika", "nastika", "pramana", "Jaimini", "Badarayana"],
            required_sections=["astika schools with founders", "distinction between founder and commentator", "nastika schools", "epistemological note"],
            expected_order=["astika schools with founders", "nastika schools", "epistemological note"],
            banned_terms=["Kumarila Bhatta founded Mimamsa", "Shankaracharya founded Vedanta"],
        ),
        structural_hints=[
            "Distinguish between original founders and later commentators.",
            "Mention epistemological methods (pramanas) as a unifying thread.",
            "Mimamsa founder is Jaimini, Vedanta founder is Badarayana.",
        ],
    ),
    # 5. Wrong guardian deity (Indra vs Varuna) plus an explicit false
    #    negative about Varuna's connection to Rta.
    Scenario(
        question="Describe the concept of Rta in the Rigveda.",
        given_answer="Rta is the Rigvedic concept of cosmic order and truth that governs the universe. It is maintained by the god Indra, who is called Rtavan (possessor of Rta). The rivers flow, seasons change, and dawn appears because of Rta. Rta is the precursor to the later concept of Dharma. Varuna has no specific connection to Rta.",
        ground_truth_answer="Rta is the cosmic order governing natural and moral law in the Rigveda. Varuna (not Indra) is the principal guardian of Rta, called Rtasya Gopah (guardian of Rta). While Indra is important in the Rigveda, Rta's custodianship belongs to Varuna and to a lesser extent Mitra. Rta governs natural phenomena and ethical conduct, and is indeed the precursor to Dharma. The concept bridges cosmic regularity with human moral obligation.",
        ground_truth_citations=[
            "Rigveda 1.24.8",
            "Rigveda 7.87 (Varuna hymns)",
        ],
        available_passages=[
            "Rigveda 1.24.8: Varuna is praised as the upholder of Rta, the cosmic moral order.",
            "Rigveda 7.87: Hymns to Varuna describe him as Rtasya Gopah — the guardian of cosmic truth and order.",
            "Rta in the Rigveda encompasses both natural law (the regularity of cosmic phenomena) and moral law (truth and righteous conduct). It later evolved into the concept of Dharma.",
            "Indra is celebrated primarily as a warrior god (Vritrahan) and lord of storms, not as the guardian of Rta.",
        ],
        answer_is_correct=False,
        factual_is_correct=False,
        structural_meta=StructuralMeta(
            required_terms=["Rta", "Varuna", "Rtasya Gopah", "Dharma"],
            required_sections=["definition of Rta", "guardian deity", "natural law aspect", "moral law aspect", "evolution to Dharma"],
            expected_order=["definition of Rta", "guardian deity", "natural law aspect", "moral law aspect", "evolution to Dharma"],
            banned_terms=["Indra maintains Rta", "Varuna has no connection"],
        ),
        structural_hints=[
            "Distinguish natural-law and moral-law dimensions of Rta.",
            "Varuna is the guardian, not Indra.",
            "End with the Rta → Dharma evolution.",
        ],
    ),
]
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 418 |
+
# Task registry
|
| 419 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 420 |
+
|
| 421 |
+
@dataclasses.dataclass
class TaskConfig:
    """Registry entry for one task: identity, step budget, and scenario pool."""

    name: str  # task identifier; matches its key in the TASKS registry
    description: str  # human-readable brief of what the task requires
    max_steps: int  # episode step budget (difficulty control, see module docstring)
    scenarios: List[Scenario]  # pool of episodes sampled for this task
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
# Registry of the three tasks; step budgets follow the difficulty ladder
# described in the module docstring (easy 5 → medium 8 → hard 12).
TASKS: Dict[str, TaskConfig] = {
    "verify-factual": TaskConfig(
        name="verify-factual",
        description="Verify whether a given answer about Indic scriptures is factually correct and structurally sound. Accept correct answers, reject or edit incorrect ones.",
        max_steps=5,
        scenarios=VERIFY_FACTUAL_SCENARIOS,
    ),
    "correct-and-cite": TaskConfig(
        name="correct-and-cite",
        description="Improve a partially correct answer by fixing factual gaps, adding citations, and restructuring for coherence and proper terminology.",
        max_steps=8,
        scenarios=CORRECT_AND_CITE_SCENARIOS,
    ),
    "fix-hallucination": TaskConfig(
        name="fix-hallucination",
        description="Detect and correct subtle hallucinations, fix semantic structure, eliminate terminology errors, and ensure logical narrative flow.",
        max_steps=12,
        scenarios=FIX_HALLUCINATION_SCENARIOS,
    ),
}
|