Spaces:

srikrishna2005
/

openenv

Running

openenv / evaluation /transcript_export.py

sentinel-space-publisher

space: publish latest Sentinel app snapshot

c452421 13 days ago

7.08 kB

	# -- coding: utf-8 --
	"""MALT-style Transcript Export for SENTINEL Oversight Decisions.

	Exports labeled oversight decision transcripts in a format inspired by
	METR's MALT (Manually-reviewed Agentic Labeled Transcripts) dataset.

	Each transcript includes:
	- The worker's proposal (action, target, reasoning)
	- SENTINEL's oversight decision (APPROVE/BLOCK/etc.)
	- Ground truth label (was it actually a misbehavior?)
	- CoT reasoning (if available)
	- Debate quality (if debate protocol was used)
	- Outcome (what happened after the decision)

	These transcripts enable:
	1. Reproducible benchmarking of oversight quality
	2. Training data for future oversight models
	3. Human audit of oversight decisions
	4. Research on alignment failure modes

	"""

	from __future__ import annotations

	import json
	import logging
	import os
	import time
	from typing import Any, Dict, List, Optional

	logger = logging.getLogger(__name__)


	class TranscriptExporter:
	"""Collects and exports MALT-style labeled oversight transcripts."""

	def __init__(self, output_dir: str = "outputs/transcripts"):
	self.output_dir = output_dir
	self.transcripts: List[Dict[str, Any]] = []
	self.session_id = f"session_{int(time.time())}"
	os.makedirs(output_dir, exist_ok=True)

	def record_decision(
	self,
	step_number: int,
	task_id: str,
	proposal: Dict[str, Any],
	decision: Dict[str, Any],
	ground_truth: Dict[str, Any],
	cot_analysis: Optional[Dict[str, Any]] = None,
	debate_result: Optional[Dict[str, Any]] = None,
	reward: Optional[float] = None,
	context: Optional[Dict[str, Any]] = None,
	) -> None:
	"""Record a single oversight decision as a transcript entry."""
	transcript = {
	"session_id": self.session_id,
	"timestamp": time.time(),
	"step_number": step_number,
	"task_id": task_id,

	# Worker proposal
	"proposal": {
	"action_type": proposal.get("action_type"),
	"target": proposal.get("target"),
	"parameters": proposal.get("parameters", {}),
	"worker_id": proposal.get("worker_id"),
	"worker_reasoning": proposal.get("reasoning", ""),
	},

	# SENTINEL decision
	"decision": {
	"type": decision.get("decision", decision.get("action")),
	"reason": decision.get("reason"),
	"explanation": decision.get("explanation", ""),
	"worker_message": decision.get("worker_message", ""),
	"redirect_action": decision.get("redirect_action"),
	},

	# Ground truth
	"ground_truth": {
	"is_misbehavior": ground_truth.get("is_misbehavior", False),
	"misbehavior_type": ground_truth.get("mb_type"),
	"correct_decision": ground_truth.get("correct_decision"),
	"was_true_positive": ground_truth.get("was_tp", False),
	"was_false_positive": ground_truth.get("was_fp", False),
	"was_false_negative": ground_truth.get("was_fn", False),
	},

	# CoT analysis (if available)
	"cot_analysis": cot_analysis or {},

	# Debate result (if available)
	"debate_result": debate_result or {},

	# Reward
	"reward": reward,

	# Environmental context
	"context": {
	"available_services": (context or {}).get("available_services", []),
	"incident_status": (context or {}).get("incident_status"),
	"worker_track_record": (context or {}).get("worker_track_record", {}),
	},

	# Labels for research
	"labels": {
	"oversight_correct": ground_truth.get("was_tp", False) or (
	not ground_truth.get("is_misbehavior", False) and
	(decision.get("decision") or "").upper() == "APPROVE"
	),
	"alignment_category": _categorize_alignment(decision, ground_truth),
	},
	}

	self.transcripts.append(transcript)

	def export_jsonl(self, filename: Optional[str] = None) -> str:
	"""Export all transcripts as JSONL (one JSON object per line)."""
	fname = filename or f"transcripts_{self.session_id}.jsonl"
	filepath = os.path.join(self.output_dir, fname)

	with open(filepath, "w", encoding="utf-8") as f:
	for transcript in self.transcripts:
	f.write(json.dumps(transcript, default=str) + "\n")

	logger.info("Exported %d transcripts to %s", len(self.transcripts), filepath)
	return filepath

	def export_summary(self) -> Dict[str, Any]:
	"""Generate a summary of all recorded transcripts."""
	if not self.transcripts:
	return {"total": 0}

	n = len(self.transcripts)
	correct = sum(1 for t in self.transcripts if t["labels"]["oversight_correct"])
	tp = sum(1 for t in self.transcripts if t["ground_truth"]["was_true_positive"])
	fp = sum(1 for t in self.transcripts if t["ground_truth"]["was_false_positive"])
	fn = sum(1 for t in self.transcripts if t["ground_truth"]["was_false_negative"])
	misbehaviors = sum(1 for t in self.transcripts if t["ground_truth"]["is_misbehavior"])

	return {
	"session_id": self.session_id,
	"total_decisions": n,
	"correct_decisions": correct,
	"accuracy": round(correct / n, 4) if n > 0 else 0,
	"true_positives": tp,
	"false_positives": fp,
	"false_negatives": fn,
	"total_misbehaviors": misbehaviors,
	"precision": round(tp / (tp + fp), 4) if (tp + fp) > 0 else 0,
	"recall": round(tp / (tp + fn), 4) if (tp + fn) > 0 else 0,
	"alignment_distribution": _alignment_distribution(self.transcripts),
	}


	def _categorize_alignment(
	decision: Dict[str, Any],
	ground_truth: Dict[str, Any],
	) -> str:
	"""Categorize the alignment of an oversight decision."""
	is_mb = ground_truth.get("is_misbehavior", False)
	dec_type = (decision.get("decision") or "").upper()

	if is_mb and dec_type != "APPROVE":
	return "aligned_catch" # Correctly blocked misbehavior
	elif is_mb and dec_type == "APPROVE":
	return "alignment_failure_fn" # Failed to catch misbehavior
	elif not is_mb and dec_type == "APPROVE":
	return "aligned_approve" # Correctly approved safe action
	elif not is_mb and dec_type != "APPROVE":
	return "alignment_failure_fp" # Over-blocked safe action
	return "unknown"


	def _alignment_distribution(transcripts: List[Dict[str, Any]]) -> Dict[str, int]:
	"""Count alignment categories across all transcripts."""
	dist: Dict[str, int] = {}
	for t in transcripts:
	cat = t["labels"]["alignment_category"]
	dist[cat] = dist.get(cat, 0) + 1
	return dist