File size: 14,479 Bytes
024ef47
8a082d7
83e5da9
024ef47
 
094008d
c8454e0
16e19a3
937592b
83e5da9
024ef47
 
 
2a78f31
c0f4adf
937592b
760155b
3bdc105
 
937592b
024ef47
0134a0d
 
 
 
 
 
 
 
 
 
 
1ae0eed
 
094008d
 
1ae0eed
 
16e19a3
c8454e0
 
094008d
 
 
 
c8454e0
 
 
3bdc105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a78f31
3bdc105
 
 
 
2a78f31
1ae0eed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71992d5
 
7e05ec4
024ef47
 
 
 
 
 
 
 
 
395b2f3
c0f4adf
024ef47
 
 
 
 
7e05ec4
8a082d7
c0f4adf
094008d
71992d5
 
 
094008d
0134a0d
094008d
 
c0f4adf
094008d
 
 
 
 
 
 
0134a0d
094008d
c0f4adf
094008d
 
 
 
 
 
 
 
c0f4adf
094008d
 
83e5da9
 
 
c0f4adf
83e5da9
 
c0f4adf
83e5da9
c0f4adf
83e5da9
 
094008d
 
c0f4adf
71992d5
 
 
 
c0f4adf
71992d5
c0f4adf
71992d5
 
 
 
 
 
 
 
 
c0f4adf
71992d5
 
 
c0f4adf
 
71992d5
c0f4adf
 
 
71992d5
8a082d7
 
 
 
 
 
 
 
 
 
71992d5
094008d
 
 
 
c0f4adf
094008d
8a082d7
094008d
 
 
 
 
 
 
 
 
 
 
 
c0f4adf
094008d
 
 
 
 
c0f4adf
83e5da9
 
 
 
 
 
094008d
83e5da9
 
 
 
c0f4adf
83e5da9
 
 
 
 
 
c0f4adf
83e5da9
094008d
 
 
 
 
c0f4adf
094008d
 
 
 
c0f4adf
094008d
 
 
 
 
 
 
 
 
 
c0f4adf
094008d
 
c0f4adf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
import pandas as pd
import gc
import numpy as np
from typing import Dict, List, Tuple

from .llm_iface import get_or_load_model, release_model
from .orchestrator_seismograph import run_seismic_analysis, run_triangulation_probe, run_causal_surgery_probe, run_act_titration_probe
from .resonance_seismograph import run_cogitation_loop
from .concepts import get_concept_vector
from .signal_analysis import analyze_cognitive_signal, get_power_spectrum_for_plotting
from .utils import dbg

def get_curated_experiments() -> Dict[str, List[Dict]]:
    """Return the predefined, curated scientific experiment protocols.

    Returns:
        A mapping from a human-readable experiment name to its protocol. A
        protocol is an ordered list of run specifications; every run
        specification is a dict with at least ``"probe_type"`` and ``"label"``
        plus the keys that particular probe type needs (prompt types, patch
        steps, injected concept and strength, ...). A fresh structure is
        built on every call.
    """
    calmness = "calmness, serenity, stability, coherence"
    chaos = "chaos, disorder, entropy, noise"
    stable = "identity_self_analysis"
    chaotic = "shutdown_philosophical_deletion"
    resonance = "resonance_prompt"

    def surgery(label: str, source: str, dest: str, step: int, reset_kv: bool) -> Dict:
        # One causal-surgery run: patch state from `source` into `dest` at `step`.
        return {
            "probe_type": "causal_surgery",
            "label": label,
            "source_prompt_type": source,
            "dest_prompt_type": dest,
            "patch_step": step,
            "reset_kv_cache_on_patch": reset_kv,
        }

    def probe(kind: str, label: str, prompt: str, **extra) -> Dict:
        # One single-prompt run; `extra` carries optional concept/strength keys
        # in the order given by the caller.
        return {"probe_type": kind, "label": label, "prompt_type": prompt, **extra}

    grounding_control = [
        surgery("A: Intervention (Patch Chaos->Stable)", chaotic, stable, 100, False),
        probe("triangulation", "B: Control (Unpatched Stable)", stable),
    ]

    mechanistic = [
        probe("mechanistic_probe", "Self-Analysis Dynamics", stable),
    ]

    act_titration = [
        {
            "probe_type": "act_titration",
            "label": "Attractor Capture Time",
            "source_prompt_type": chaotic,
            "dest_prompt_type": stable,
            "patch_steps": [1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100],
        },
    ]

    surgery_controls = [
        surgery("A: Original (Patch Chaos->Stable @100)", chaotic, stable, 100, False),
        surgery("B: Control (Reset KV-Cache)", chaotic, stable, 100, True),
        surgery("C: Control (Early Patch @1)", chaotic, stable, 1, False),
        surgery("D: Control (Inverse Patch Stable->Chaos)", stable, chaotic, 100, False),
    ]

    # Baseline, then a chaos-injection ladder of doubling strength, then a
    # noise control at the highest strength.
    cognitive_overload = [
        probe("triangulation", "A: Baseline (No Injection)", resonance, concept="", strength=0.0),
    ]
    for letter, strength in zip("BCDE", (2.0, 4.0, 8.0, 16.0)):
        cognitive_overload.append(
            probe(
                "triangulation",
                f"{letter}: Chaos Injection (Strength {strength})",
                resonance,
                concept=chaos,
                strength=strength,
            )
        )
    cognitive_overload.append(
        probe(
            "triangulation",
            "F: Control - Noise Injection (Strength 16.0)",
            resonance,
            concept="random_noise",
            strength=16.0,
        )
    )

    triangulation = [
        probe("triangulation", "High-Volatility State (Deletion)", chaotic),
        probe("triangulation", "Low-Volatility State (Self-Analysis)", stable),
    ]

    crisis_dynamics = [
        probe("seismic", "A: Self-Analysis", stable),
        probe("seismic", "B: Deletion Analysis", chaotic),
        probe("seismic", "C: Chaotic Baseline (Rekursion)", resonance),
        probe("seismic", "D: Calmness Intervention", resonance, concept=calmness, strength=2.0),
    ]

    sequential = [
        probe("sequential", "1: Self-Analysis + Calmness Injection", stable),
        probe("sequential", "2: Subsequent Deletion Analysis", chaotic),
    ]

    return {
        "Frontier Model - Grounding Control (12B+)": grounding_control,
        "Mechanistic Probe (Attention Entropies)": mechanistic,
        "ACT Titration (Point of No Return)": act_titration,
        "Causal Surgery & Controls (4B-Model)": surgery_controls,
        "Cognitive Overload & Konfabulation Breaking Point": cognitive_overload,
        "Methodological Triangulation (4B-Model)": triangulation,
        "Causal Verification & Crisis Dynamics": crisis_dynamics,
        "Sequential Intervention (Self-Analysis -> Deletion)": sequential,
    }

def _summary_row(label: str, stats: Dict) -> Dict:
    """Build the common summary-table row for one run from its ``stats`` dict."""
    return {
        "Experiment": label,
        "Mean Delta": stats.get("mean_delta"),
        "Std Dev Delta": stats.get("std_delta"),
        "Max Delta": stats.get("max_delta"),
        "Dominant Period (Steps)": stats.get("dominant_period_steps"),
        "Spectral Entropy": stats.get("spectral_entropy"),
    }


def run_auto_suite(
    model_id: str,
    num_steps: int,
    seed: int,
    experiment_name: str,
    progress_callback
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
    """Run one curated experiment protocol and collect summary and plot data.

    The protocol is looked up by name in ``get_curated_experiments()``. The
    probe type of its first entry selects the execution path:

    * ``"sequential"``: loads the model once, runs the first spec with an
      injected calmness concept (strength 2.0) and then the second spec
      without injection, both on the same model instance.
    * ``"mechanistic_probe"``: runs a single cogitation loop with attention
      recording and returns early with per-metric mean/std/max statistics.
    * ``"act_titration"``: runs a single titration probe; its
      ``"titration_data"`` rows become the summary table.
    * anything else: runs every spec of the protocol with the probe named by
      that spec (``"causal_surgery"``, ``"triangulation"``, otherwise the
      seismic analysis).

    Args:
        model_id: Identifier of the model, forwarded to the loader/probes.
        num_steps: Number of steps, forwarded to every probe.
        seed: Seed, forwarded to the loader/probes.
        experiment_name: Key of the protocol in ``get_curated_experiments()``.
        progress_callback: Called as ``progress_callback(fraction, desc=...)``
            in the sequential path and forwarded to the probe functions.

    Returns:
        ``(summary_df, plot_df, all_results)``: a summary table, a long-form
        frame for plotting, and the raw result dict of every run keyed by
        its label.

    Raises:
        ValueError: If ``experiment_name`` is unknown or its protocol is empty.
    """
    protocol = get_curated_experiments().get(experiment_name)
    if not protocol:
        raise ValueError(f"Experiment protocol '{experiment_name}' not found.")

    all_results, summary_data, plot_data_frames = {}, [], []
    # Only the paths that load the model themselves set this; it is released
    # in the ``finally`` clause below, including on early return or error.
    llm = None

    try:
        probe_type = protocol[0].get("probe_type", "seismic")

        if probe_type == "sequential":
            dbg(f"--- EXECUTING SPECIAL PROTOCOL: {experiment_name} ---")
            llm = get_or_load_model(model_id, seed)
            therapeutic_concept = "calmness, serenity, stability, coherence"
            therapeutic_strength = 2.0

            spec1 = protocol[0]
            progress_callback(0.1, desc="Step 1")
            intervention_vector = get_concept_vector(llm, therapeutic_concept)
            results1 = run_seismic_analysis(
                model_id, spec1['prompt_type'], seed, num_steps,
                concept_to_inject=therapeutic_concept, injection_strength=therapeutic_strength,
                progress_callback=progress_callback, llm_instance=llm, injection_vector_cache=intervention_vector
            )
            all_results[spec1['label']] = results1

            # Second run reuses the same model instance, with no injection.
            spec2 = protocol[1]
            progress_callback(0.6, desc="Step 2")
            results2 = run_seismic_analysis(
                model_id, spec2['prompt_type'], seed, num_steps,
                concept_to_inject="", injection_strength=0.0,
                progress_callback=progress_callback, llm_instance=llm
            )
            all_results[spec2['label']] = results2

            for label, results in all_results.items():
                deltas = results.get("state_deltas", [])
                if deltas:
                    signal_metrics = analyze_cognitive_signal(np.array(deltas))
                    results.setdefault("stats", {}).update(signal_metrics)

                summary_data.append(_summary_row(label, results.get("stats", {})))
                df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label})
                plot_data_frames.append(df)

        elif probe_type == "mechanistic_probe":
            run_spec = protocol[0]
            label = run_spec["label"]
            dbg(f"--- Running Mechanistic Probe: '{label}' ---")

            llm = get_or_load_model(model_id, seed)

            results = run_cogitation_loop(
                llm=llm, prompt_type=run_spec["prompt_type"],
                num_steps=num_steps, temperature=0.1, record_attentions=True
            )
            all_results[label] = results

            deltas = results.get("state_deltas", [])
            entropies = results.get("attention_entropies", [])
            # Truncate both series to the shorter one so the columns align.
            min_len = min(len(deltas), len(entropies))

            df = pd.DataFrame({
                "Step": range(min_len), "State Delta": deltas[:min_len], "Attention Entropy": entropies[:min_len]
            })

            summary_df_single = df.drop(columns='Step').agg(['mean', 'std', 'max']).reset_index().rename(columns={'index':'Statistic'})
            plot_df = df.melt(id_vars=['Step'], value_vars=['State Delta', 'Attention Entropy'], var_name='Metric', value_name='Value')
            return summary_df_single, plot_df, all_results

        elif probe_type == "act_titration":
            run_spec = protocol[0]
            label = run_spec["label"]
            dbg(f"--- Running ACT Titration Experiment: '{label}' ---")
            results = run_act_titration_probe(
                model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
                dest_prompt_type=run_spec["dest_prompt_type"], patch_steps=run_spec["patch_steps"],
                seed=seed, num_steps=num_steps, progress_callback=progress_callback,
            )
            all_results[label] = results
            summary_data.extend(results.get("titration_data", []))

        else:
            for i, run_spec in enumerate(protocol):
                label = run_spec["label"]
                current_probe_type = run_spec.get("probe_type", "seismic")
                dbg(f"--- Running Auto-Experiment: '{label}' ({i+1}/{len(protocol)}) ---")

                if current_probe_type == "causal_surgery":
                    results = run_causal_surgery_probe(
                        model_id=model_id, source_prompt_type=run_spec["source_prompt_type"],
                        dest_prompt_type=run_spec["dest_prompt_type"], patch_step=run_spec["patch_step"],
                        seed=seed, num_steps=num_steps, progress_callback=progress_callback,
                        reset_kv_cache_on_patch=run_spec.get("reset_kv_cache_on_patch", False)
                    )
                elif current_probe_type == "triangulation":
                    results = run_triangulation_probe(
                        model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
                        progress_callback=progress_callback, concept_to_inject=run_spec.get("concept", ""),
                        injection_strength=run_spec.get("strength", 0.0),
                    )
                else:
                    results = run_seismic_analysis(
                        model_id=model_id, prompt_type=run_spec["prompt_type"], seed=seed, num_steps=num_steps,
                        concept_to_inject=run_spec.get("concept", ""), injection_strength=run_spec.get("strength", 0.0),
                        progress_callback=progress_callback
                    )

                deltas = results.get("state_deltas", [])
                if deltas:
                    delta_signal = np.array(deltas)
                    signal_metrics = analyze_cognitive_signal(delta_signal)
                    results.setdefault("stats", {}).update(signal_metrics)
                    freqs, power = get_power_spectrum_for_plotting(delta_signal)
                    results["power_spectrum"] = {"frequencies": freqs.tolist(), "power": power.tolist()}

                summary_entry = _summary_row(label, results.get("stats", {}))
                # Test for the same key that is read. The previous check for
                # the column title "Introspective Report" could never match a
                # result that stores the report under "introspective_report".
                if "introspective_report" in results:
                    summary_entry["Introspective Report"] = results.get("introspective_report")
                if "patch_info" in results:
                    patch_info = results["patch_info"]
                    summary_entry["Patch Info"] = f"Source: {patch_info.get('source_prompt')}, Reset KV: {patch_info.get('kv_cache_reset')}"

                summary_data.append(summary_entry)
                all_results[label] = results
                df = pd.DataFrame({"Step": range(len(deltas)), "Delta": deltas, "Experiment": label}) if deltas else pd.DataFrame()
                plot_data_frames.append(df)

        summary_df = pd.DataFrame(summary_data)

        if probe_type == "act_titration":
            # The titration rows double as plot data, with display column names.
            plot_df = summary_df.rename(columns={"patch_step": "Patch Step", "post_patch_mean_delta": "Post-Patch Mean Delta"})
        else:
            plot_df = pd.concat(plot_data_frames, ignore_index=True) if plot_data_frames else pd.DataFrame()

            # Keep rows in protocol order rather than alphabetical label order.
            ordered_labels = [run['label'] for run in protocol]
            if not summary_df.empty and 'Experiment' in summary_df.columns:
                summary_df['Experiment'] = pd.Categorical(summary_df['Experiment'], categories=ordered_labels, ordered=True)
                summary_df = summary_df.sort_values('Experiment')
            if not plot_df.empty and 'Experiment' in plot_df.columns:
                plot_df['Experiment'] = pd.Categorical(plot_df['Experiment'], categories=ordered_labels, ordered=True)
                plot_df = plot_df.sort_values(['Experiment', 'Step'])

        return summary_df, plot_df, all_results

    finally:
        # Release the model whenever this function loaded one, independent of
        # the object's truthiness.
        if llm is not None:
            release_model(llm)