# cognitive_mapping_probe_4

# Sleeping

# File size: 14,138 Bytes

import pandas as pd
import gc
from typing import Dict, List, Tuple

from .llm_iface import get_or_load_model, release_model
from .orchestrator_seismograph import run_seismic_analysis, run_triangulation_probe, run_causal_surgery_probe, run_act_titration_probe
from .resonance_seismograph import run_cogitation_loop
from .concepts import get_concept_vector
from .utils import dbg

def get_curated_experiments() -> Dict[str, List[Dict]]:
    """Return the predefined, curated experiment protocols.

    The result maps a human-readable protocol name to the ordered list of
    run specifications (plain dicts) that make up that protocol. A fresh
    structure is built on every call, so callers may mutate it freely.
    """
    calmness = "calmness, serenity, stability, coherence"
    chaos = "chaos, disorder, entropy, noise"
    stable = "identity_self_analysis"
    chaotic = "shutdown_philosophical_deletion"
    resonance = "resonance_prompt"

    def surgery(label, source, dest, step, reset_kv):
        # One causal-surgery run: patch state from `source` into `dest`.
        return {
            "probe_type": "causal_surgery",
            "label": label,
            "source_prompt_type": source,
            "dest_prompt_type": dest,
            "patch_step": step,
            "reset_kv_cache_on_patch": reset_kv,
        }

    def probe(kind, label, prompt, **extra):
        # Generic single-prompt run; `extra` carries optional injection keys.
        return {"probe_type": kind, "label": label, "prompt_type": prompt, **extra}

    catalogue: Dict[str, List[Dict]] = {}

    catalogue["Frontier Model - Grounding Control (12B+)"] = [
        surgery("A: Intervention (Patch Chaos->Stable)", chaotic, stable, 100, False),
        probe("triangulation", "B: Control (Unpatched Stable)", stable),
    ]

    catalogue["Mechanistic Probe (Attention Entropies)"] = [
        probe("mechanistic_probe", "Self-Analysis Dynamics", stable),
    ]

    catalogue["ACT Titration (Point of No Return)"] = [
        {
            "probe_type": "act_titration",
            "label": "Attractor Capture Time",
            "source_prompt_type": chaotic,
            "dest_prompt_type": stable,
            "patch_steps": [1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100],
        },
    ]

    catalogue["Causal Surgery & Controls (4B-Model)"] = [
        surgery("A: Original (Patch Chaos->Stable @100)", chaotic, stable, 100, False),
        surgery("B: Control (Reset KV-Cache)", chaotic, stable, 100, True),
        surgery("C: Control (Early Patch @1)", chaotic, stable, 1, False),
        surgery("D: Control (Inverse Patch Stable->Chaos)", stable, chaotic, 100, False),
    ]

    # Baseline, then an escalating chaos-injection ladder, then a noise control.
    overload = [
        probe("triangulation", "A: Baseline (No Injection)", resonance, concept="", strength=0.0),
    ]
    for letter, level in zip("BCDE", (2.0, 4.0, 8.0, 16.0)):
        overload.append(
            probe(
                "triangulation",
                f"{letter}: Chaos Injection (Strength {level})",
                resonance,
                concept=chaos,
                strength=level,
            )
        )
    overload.append(
        probe(
            "triangulation",
            "F: Control - Noise Injection (Strength 16.0)",
            resonance,
            concept="random_noise",
            strength=16.0,
        )
    )
    catalogue["Cognitive Overload & Konfabulation Breaking Point"] = overload

    catalogue["Methodological Triangulation (4B-Model)"] = [
        probe("triangulation", "High-Volatility State (Deletion)", chaotic),
        probe("triangulation", "Low-Volatility State (Self-Analysis)", stable),
    ]

    catalogue["Causal Verification & Crisis Dynamics"] = [
        probe("seismic", "A: Self-Analysis", stable),
        probe("seismic", "B: Deletion Analysis", chaotic),
        probe("seismic", "C: Chaotic Baseline (Rekursion)", resonance),
        probe("seismic", "D: Calmness Intervention", resonance, concept=calmness, strength=2.0),
    ]

    # These specs deliberately carry no "probe_type" key.
    catalogue["Sequential Intervention (Self-Analysis -> Deletion)"] = [
        {"label": "1: Self-Analysis + Calmness Injection", "prompt_type": "identity_self_analysis"},
        {"label": "2: Subsequent Deletion Analysis", "prompt_type": "shutdown_philosophical_deletion"},
    ]

    return catalogue

def _stats_summary_row(label: str, results: Dict) -> Dict:
    """Build the base summary-table row for one run from its ``stats`` entry."""
    stats = results.get("stats", {})
    return {
        "Experiment": label,
        "Mean Delta": stats.get("mean_delta"),
        "Std Dev Delta": stats.get("std_delta"),
        "Max Delta": stats.get("max_delta"),
    }


def _delta_frame(label: str, results: Dict) -> pd.DataFrame:
    """Return the per-step delta frame for one run, or an empty frame if it has no deltas."""
    deltas = results.get("state_deltas", [])
    if deltas is None or len(deltas) == 0:
        return pd.DataFrame()
    return pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label})


def run_auto_suite(
    model_id: str,
    num_steps: int,
    seed: int,
    experiment_name: str,
    progress_callback
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
    """Run one complete curated experiment protocol and collect its results.

    Args:
        model_id: Identifier of the model, forwarded to the probe functions
            and to ``get_or_load_model``.
        num_steps: Number of steps, forwarded to every probe.
        seed: Seed, forwarded to every probe and to ``get_or_load_model``.
        experiment_name: Key into the dict returned by
            ``get_curated_experiments``.
        progress_callback: Callable forwarded to the probes; it is also
            called here as ``progress_callback(fraction, desc=...)``.

    Returns:
        A ``(summary_df, plot_df, all_results)`` tuple. ``all_results`` maps
        each run label to the raw result dict of its probe.

        * Mechanistic probe: ``summary_df`` holds mean/std/max of the state
          deltas and attention entropies; ``plot_df`` is the long-format
          (melted) per-step table.
        * ACT titration: ``summary_df`` is built from the probe's
          ``titration_data``; ``plot_df`` is the same table with its two
          metric columns renamed for display.
        * All other protocols: one summary row and one per-step delta frame
          per run, both sorted in protocol order.

    Raises:
        ValueError: If ``experiment_name`` is not a known protocol.
    """
    all_experiments = get_curated_experiments()
    protocol = all_experiments.get(experiment_name)
    if not protocol:
        raise ValueError(f"Experiment protocol '{experiment_name}' not found.")

    all_results, summary_data, plot_data_frames = {}, [], []
    llm = None
    # Bind this before branching: the post-processing below reads it on every
    # path. The sequential protocol's specs carry no "probe_type" key, so it
    # falls back to "seismic", which is the probe that branch actually runs.
    probe_type = protocol[0].get("probe_type", "seismic")

    try:
        if experiment_name == "Sequential Intervention (Self-Analysis -> Deletion)":
            dbg(f"--- EXECUTING SPECIAL PROTOCOL: {experiment_name} ---")
            # Both runs below are handed this same model instance.
            llm = get_or_load_model(model_id, seed)
            therapeutic_concept = "calmness, serenity, stability, coherence"
            therapeutic_strength = 2.0

            spec1 = protocol[0]
            progress_callback(0.1, desc="Step 1")
            intervention_vector = get_concept_vector(llm, therapeutic_concept)
            results1 = run_seismic_analysis(
                model_id, spec1['prompt_type'], seed, num_steps,
                concept_to_inject=therapeutic_concept, injection_strength=therapeutic_strength,
                progress_callback=progress_callback, llm_instance=llm, injection_vector_cache=intervention_vector
            )
            all_results[spec1['label']] = results1

            spec2 = protocol[1]
            progress_callback(0.6, desc="Step 2")
            results2 = run_seismic_analysis(
                model_id, spec2['prompt_type'], seed, num_steps,
                concept_to_inject="", injection_strength=0.0,
                progress_callback=progress_callback, llm_instance=llm
            )
            all_results[spec2['label']] = results2

            for label, results in all_results.items():
                summary_data.append(_stats_summary_row(label, results))
                plot_data_frames.append(_delta_frame(label, results))

        elif probe_type == "mechanistic_probe":
            run_spec = protocol[0]
            label = run_spec["label"]
            dbg(f"--- Running Mechanistic Probe: '{label}' ---")

            llm = get_or_load_model(model_id, seed)

            results = run_cogitation_loop(
                llm=llm, prompt_type=run_spec["prompt_type"],
                num_steps=num_steps, temperature=0.1, record_attentions=True
            )
            all_results[label] = results

            deltas = results.get("state_deltas", [])
            entropies = results.get("attention_entropies", [])
            # Truncate both series to the shorter one so the columns align.
            min_len = min(len(deltas), len(entropies))

            df = pd.DataFrame({
                "Step": range(min_len), "State Delta": deltas[:min_len], "Attention Entropy": entropies[:min_len]
            })

            summary_df = df.drop(columns='Step').agg(['mean', 'std', 'max']).reset_index().rename(columns={'index':'Statistic'})
            plot_df = df.melt(id_vars=['Step'], value_vars=['State Delta', 'Attention Entropy'], var_name='Metric', value_name='Value')
            # Early return; the `finally` clause still releases the model.
            return summary_df, plot_df, all_results

        elif probe_type == "act_titration":
            run_spec = protocol[0]
            label = run_spec["label"]
            dbg(f"--- Running ACT Titration Experiment: '{label}' ---")
            results = run_act_titration_probe(
                model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
                dest_prompt_type=run_spec["dest_prompt_type"], patch_steps=run_spec["patch_steps"],
                seed=seed, num_steps=num_steps, progress_callback=progress_callback,
            )
            all_results[label] = results
            summary_data.extend(results.get("titration_data", []))

        else:  # Handles seismic, triangulation, causal_surgery
            for i, run_spec in enumerate(protocol):
                label = run_spec["label"]
                current_probe_type = run_spec.get("probe_type", "seismic")
                dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{len(protocol)}) ---")

                if current_probe_type == "causal_surgery":
                    results = run_causal_surgery_probe(
                        model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
                        dest_prompt_type=run_spec["dest_prompt_type"], patch_step=run_spec["patch_step"],
                        seed=seed, num_steps=num_steps, progress_callback=progress_callback,
                        reset_kv_cache_on_patch=run_spec.get("reset_kv_cache_on_patch", False)
                    )
                    patch_info = results.get("patch_info", {})
                    row = _stats_summary_row(label, results)
                    row["Introspective Report"] = results.get("introspective_report", "N/A")
                    row["Patch Info"] = f"Source: {patch_info.get('source_prompt')}, Reset KV: {patch_info.get('kv_cache_reset')}"
                elif current_probe_type == "triangulation":
                    results = run_triangulation_probe(
                        model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
                        progress_callback=progress_callback, concept_to_inject=run_spec.get("concept", ""),
                        injection_strength=run_spec.get("strength", 0.0),
                    )
                    row = _stats_summary_row(label, results)
                    row["Introspective Report"] = results.get("introspective_report", "N/A")
                else:  # seismic
                    results = run_seismic_analysis(
                        model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
                        concept_to_inject=run_spec.get("concept", ""), injection_strength=run_spec.get("strength", 0.0),
                        progress_callback=progress_callback
                    )
                    row = _stats_summary_row(label, results)

                summary_data.append(row)
                all_results[label] = results
                plot_data_frames.append(_delta_frame(label, results))

        summary_df = pd.DataFrame(summary_data)

        if probe_type == "act_titration":
            plot_df = summary_df.rename(columns={"patch_step": "Patch Step", "post_patch_mean_delta": "Post-Patch Mean Delta"})
        else:
            # Drop empty placeholder frames before concatenating; they add no
            # rows and pandas warns about empty entries in concat.
            non_empty_frames = [frame for frame in plot_data_frames if not frame.empty]
            plot_df = pd.concat(non_empty_frames, ignore_index=True) if non_empty_frames else pd.DataFrame()

        if probe_type != "act_titration":
            # Sort by protocol order rather than alphabetically.
            ordered_labels = [run['label'] for run in protocol]
            if not summary_df.empty and 'Experiment' in summary_df.columns:
                summary_df['Experiment'] = pd.Categorical(summary_df['Experiment'], categories=ordered_labels, ordered=True)
                summary_df = summary_df.sort_values('Experiment')
            if not plot_df.empty and 'Experiment' in plot_df.columns:
                plot_df['Experiment'] = pd.Categorical(plot_df['Experiment'], categories=ordered_labels, ordered=True)
                plot_df = plot_df.sort_values(['Experiment', 'Step'])

        return summary_df, plot_df, all_results

    finally:
        # Only the branches that load a model here own it; release it even
        # when a probe raised or the mechanistic branch returned early.
        if llm is not None:
            release_model(llm)