| | """ |
| | Experiment Configuration |
| | |
| | Defines configuration for controlled experiments with statistical rigor. |
| | """ |
| |
|
| | from __future__ import annotations |
| |
|
| | from dataclasses import dataclass, field |
| | from enum import Enum |
| | from typing import List, Dict, Any, Optional |
| | from pathlib import Path |
| | import json |
| |
|
| |
|
class PlannerMode(str, Enum):
    """
    Planning strategies an experiment can be run under.

    Members are also ``str`` instances, so each one compares equal to its
    string value. Those values are what ``ExperimentConfig.to_dict`` writes
    out and what ``PlannerMode(value)`` accepts when a config is read back.
    """

    # NOTE(review): the behaviour behind each mode is implemented outside
    # this file; the names are the only description available here.
    DIRECT = "direct"
    SINGLE_PLANNER = "single_planner"
    COUNCIL = "council"
    EXTENDED_PROMPT = "extended_prompt"
| |
|
| |
|
class PerturbationType(str, Enum):
    """
    Perturbation conditions used for sensitivity testing.

    Members are also ``str`` instances, so each one compares equal to its
    string value; ``PerturbationType(value)`` maps a stored string back to
    the member.
    """

    # Unperturbed reference condition.
    BASELINE = "baseline"

    # Mismatched-modality conditions.
    WRONG_IMAGE = "wrong_image"
    WRONG_AUDIO = "wrong_audio"

    # Graded semantic shifts.
    # NOTE(review): 25/50/75 presumably denote shift strength in percent —
    # confirm against the code that applies the perturbation.
    SEMANTIC_SHIFT_25 = "semantic_shift_25"
    SEMANTIC_SHIFT_50 = "semantic_shift_50"
    SEMANTIC_SHIFT_75 = "semantic_shift_75"

    # Randomised-modality conditions.
    RANDOM_IMAGE = "random_image"
    RANDOM_AUDIO = "random_audio"
| |
|
| |
|
@dataclass
class ExperimentConfig:
    """
    Configuration for controlled experiments.

    Supports:
    - Multiple planning modes (RQ2: planning effect)
    - Multiple perturbation conditions (RQ1: MSCI sensitivity)
    - Statistical parameters for hypothesis testing

    Attributes:
        name: Human-readable experiment name.
        description: Free-text description of the experiment.
        n_prompts: Number of prompts; also the sample size that
            ``validate`` compares against the power analysis.
        n_seeds: Number of seeds (replications) per prompt.
        modes: Planning modes to run.
        perturbations: Perturbation conditions to run.
        alpha: Significance level, treated as two-sided in the power analysis.
        power: Target statistical power.
        min_effect_size: Smallest effect size (d) the experiment should be
            able to detect; default input to ``required_sample_size``.
        deterministic, base_seed, use_ollama, output_dir, track_tokens,
        track_time: Stored and serialized only; no method of this class
            reads them, so their meaning is defined by the consuming code.
    """

    # Identification
    name: str = "default_experiment"
    description: str = ""

    # Sample design
    n_prompts: int = 50
    n_seeds: int = 3

    # Experimental factors; default_factory gives every instance its own list.
    modes: List[PlannerMode] = field(
        default_factory=lambda: [PlannerMode.DIRECT, PlannerMode.SINGLE_PLANNER]
    )
    perturbations: List[PerturbationType] = field(
        default_factory=lambda: [
            PerturbationType.BASELINE,
            PerturbationType.WRONG_IMAGE,
            PerturbationType.WRONG_AUDIO,
        ]
    )

    # Statistical parameters
    alpha: float = 0.05
    power: float = 0.80
    min_effect_size: float = 0.5

    # Execution settings (not interpreted by this class)
    deterministic: bool = True
    base_seed: int = 42
    use_ollama: bool = True
    output_dir: str = "runs/experiments"

    # Bookkeeping switches (not interpreted by this class)
    track_tokens: bool = True
    track_time: bool = True

    @property
    def total_runs(self) -> int:
        """Total number of experimental runs (prompts x seeds x modes x perturbations)."""
        return self.n_prompts * self.n_seeds * len(self.modes) * len(self.perturbations)

    @property
    def conditions(self) -> List[str]:
        """List of all condition keys (``<mode>_<perturbation>``), modes varying slowest."""
        return [
            f"{mode.value}_{pert.value}"
            for mode in self.modes
            for pert in self.perturbations
        ]

    def required_sample_size(self, effect_size: Optional[float] = None) -> int:
        """
        Estimate the sample size needed to detect an effect of a given size.

        Uses the normal approximation N ≈ 2 * ((z_alpha + z_beta) / d)^2 with
        a two-sided ``alpha``, rounded up to the next integer. With
        alpha=0.05 and power=0.80 this yields:
        - d=0.8 (large):  N = 25
        - d=0.5 (medium): N = 63
        - d=0.3 (small):  N = 175

        NOTE(review): the factor of 2 corresponds to the per-group size for
        comparing two independent samples. A paired design needs roughly
        half as many (about 34 / 15 / 90 pairs for d = 0.5 / 0.8 / 0.3).
        Confirm which design the analysis actually uses.

        Args:
            effect_size: Effect size d to power for. ``None`` (the default)
                means "use ``self.min_effect_size``". Only the magnitude
                matters because d is squared.

        Returns:
            The required sample size as an integer.

        Raises:
            ValueError: If the effect size is zero (the required sample
                size would be unbounded).
        """
        # Imported lazily so scipy is only needed when this method is called.
        from scipy import stats

        # Compare against None explicitly: an explicit 0.0 must not be
        # silently replaced by the default.
        d = self.min_effect_size if effect_size is None else effect_size
        if d == 0:
            raise ValueError("effect size must be non-zero to compute a sample size")

        z_alpha = stats.norm.ppf(1 - self.alpha / 2)
        z_beta = stats.norm.ppf(self.power)

        n = 2 * ((z_alpha + z_beta) / d) ** 2
        return int(n) + 1

    def validate(self) -> List[str]:
        """
        Check the configuration for statistical weaknesses.

        Returns:
            A list of human-readable warning messages; empty when nothing
            was flagged. Nothing is raised for a questionable configuration.
        """
        messages: List[str] = []

        if self.min_effect_size == 0:
            # A zero effect size has no finite required sample size; report
            # it instead of letting the power analysis raise.
            messages.append(
                "min_effect_size is 0: required sample size cannot be computed"
            )
        else:
            required_n = self.required_sample_size()
            if self.n_prompts < required_n:
                messages.append(
                    f"Sample size ({self.n_prompts}) may be underpowered. "
                    f"Recommended: {required_n} for effect size d={self.min_effect_size}"
                )

        if self.n_seeds < 2:
            messages.append(
                "n_seeds < 2: No replication variance can be estimated"
            )

        if self.alpha > 0.10:
            messages.append(
                f"High alpha ({self.alpha}): Increased false positive risk"
            )

        return messages

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to a JSON-serializable dictionary.

        Enum members are written as their string values. The derived
        ``total_runs`` and ``conditions`` entries are included for
        reference; ``from_dict`` discards them when reading back.
        """
        return {
            "name": self.name,
            "description": self.description,
            "n_prompts": self.n_prompts,
            "n_seeds": self.n_seeds,
            "modes": [m.value for m in self.modes],
            "perturbations": [p.value for p in self.perturbations],
            "alpha": self.alpha,
            "power": self.power,
            "min_effect_size": self.min_effect_size,
            "deterministic": self.deterministic,
            "base_seed": self.base_seed,
            "use_ollama": self.use_ollama,
            "output_dir": self.output_dir,
            "track_tokens": self.track_tokens,
            "track_time": self.track_time,
            "total_runs": self.total_runs,
            "conditions": self.conditions,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExperimentConfig":
        """
        Create a configuration from a dictionary such as ``to_dict`` produces.

        The input dictionary is not modified.

        Args:
            data: Mapping of field names to values. ``modes`` and
                ``perturbations`` hold enum values (strings) or members.
                ``total_runs`` and ``conditions`` are ignored if present.

        Raises:
            ValueError: If a mode or perturbation value is not a valid member.
            TypeError: If ``data`` contains a key that is not a field.
        """
        # Work on a shallow copy so the caller's dict is left untouched.
        params = dict(data)

        if "modes" in params:
            params["modes"] = [PlannerMode(m) for m in params["modes"]]
        if "perturbations" in params:
            params["perturbations"] = [
                PerturbationType(p) for p in params["perturbations"]
            ]

        # Derived values are recomputed from the fields, not passed in.
        params.pop("total_runs", None)
        params.pop("conditions", None)

        return cls(**params)

    def save(self, path: Path) -> None:
        """
        Save configuration to a JSON file, creating parent directories.

        Args:
            path: Destination file; anything ``Path()`` accepts works.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, path: Path) -> "ExperimentConfig":
        """
        Load configuration from a JSON file written by ``save``.

        Args:
            path: Source file; anything ``Path()`` accepts works.
        """
        with Path(path).open("r", encoding="utf-8") as f:
            return cls.from_dict(json.load(f))
| |
|
| |
|
| | |
# Ready-made experiment configurations, looked up by preset key.
# NOTE: the values are shared, mutable ExperimentConfig instances; changing a
# field on one changes it for every later lookup of that key.
PRESETS: Dict[str, ExperimentConfig] = {
    # RQ1: a single planning mode across six perturbation conditions.
    "rq1_sensitivity": ExperimentConfig(
        name="RQ1: MSCI Sensitivity",
        description="Test if MSCI is sensitive to controlled semantic perturbations",
        modes=[PlannerMode.SINGLE_PLANNER],
        perturbations=[
            PerturbationType.BASELINE,
            PerturbationType.WRONG_IMAGE,
            PerturbationType.WRONG_AUDIO,
            PerturbationType.SEMANTIC_SHIFT_25,
            PerturbationType.SEMANTIC_SHIFT_50,
            PerturbationType.SEMANTIC_SHIFT_75,
        ],
        n_prompts=50,
        n_seeds=3,
    ),
    # RQ2: four planning modes, baseline condition only.
    "rq2_planning": ExperimentConfig(
        name="RQ2: Planning Effect",
        description="Test if structured planning improves cross-modal alignment",
        modes=[
            PlannerMode.DIRECT,
            PlannerMode.SINGLE_PLANNER,
            PlannerMode.COUNCIL,
            PlannerMode.EXTENDED_PROMPT,
        ],
        perturbations=[PerturbationType.BASELINE],
        n_prompts=50,
        n_seeds=3,
    ),
    # Ablation: four planning modes crossed with three perturbation conditions.
    "full_ablation": ExperimentConfig(
        name="Full Ablation Study",
        description="Complete ablation across all modes and perturbations",
        modes=[
            PlannerMode.DIRECT,
            PlannerMode.SINGLE_PLANNER,
            PlannerMode.COUNCIL,
            PlannerMode.EXTENDED_PROMPT,
        ],
        perturbations=[
            PerturbationType.BASELINE,
            PerturbationType.WRONG_IMAGE,
            PerturbationType.WRONG_AUDIO,
        ],
        n_prompts=50,
        n_seeds=3,
    ),
    # Smoke test: 5 prompts, 1 seed, 2 modes x 2 perturbations.
    "quick_test": ExperimentConfig(
        name="Quick Test",
        description="Small-scale test run",
        modes=[PlannerMode.DIRECT, PlannerMode.SINGLE_PLANNER],
        perturbations=[PerturbationType.BASELINE, PerturbationType.WRONG_IMAGE],
        n_prompts=5,
        n_seeds=1,
    ),
}
| |
|