| """ |
| Retrieval modes for EEG semantic decoding. |
| |
| Each mode is a callable class that takes: |
| embedding_index: EmbeddingIndex |
| nexus_conn: sqlite3 connection |
| semantic_embedding: torch.Tensor (the current predicted text embedding) |
| |
| And returns: |
| list of strings (lines to print), or empty list (suppress output this frame) |
| |
| Drop into EEGSemanticProcessor by replacing the find_similar_messages + _print_unique_lines |
| path with: mode.step(semantic_embedding) -> lines |
| """ |
|
|
| import numpy as np |
| import torch |
| import hashlib |
| import random |
| from collections import deque |
|
|
|
|
def fix_encoding(s):
    """Best-effort cleanup of mojibake/surrogate-polluted text.

    Round-trips *s* through UTF-8 ('surrogateescape' on encode,
    'replace' on decode) so lone surrogates become U+FFFD, then drops
    the string entirely when common mojibake markers survive.

    Args:
        s: str or bytes (falsy input is returned unchanged).

    Returns:
        Cleaned str, or "" when mojibake markers are detected.
    """
    if not s:
        return s
    if isinstance(s, str):
        b = s.encode('utf-8', 'surrogateescape')
    else:
        b = s
    fixed = b.decode('utf-8', 'replace')
    # BUG FIX: the marker check previously ran against the raw input `s`,
    # which raises TypeError when `s` is bytes ('ì' in b'...'). Checking
    # the decoded `fixed` is equivalent for str input and safe for bytes.
    if 'ì' in fixed or 'í' in fixed or 'ï' in fixed:
        return ""
    return fixed
|
|
|
|
| def _retrieve(embedding_index, nexus_conn, embedding_np, k=64, assistant_only=False): |
| """Shared retrieval helper. Returns list of (content, distance) tuples.""" |
| if len(embedding_np.shape) == 1: |
| embedding_np = embedding_np.reshape(1, -1) |
|
|
| distances, indices = embedding_index.search(embedding_np, k) |
| distances = distances.flatten() |
| indices = indices.flatten() |
|
|
| cursor = nexus_conn.cursor() |
| query = "SELECT content FROM messages WHERE id = ?" |
| if assistant_only: |
| query += " AND role = 'assistant'" |
|
|
| results = [] |
| for msg_id, dist in zip(indices, distances): |
| cursor.execute(query, (int(msg_id),)) |
| row = cursor.fetchone() |
| if row and row[0]: |
| results.append((row[0], float(dist))) |
| return results |
|
|
|
|
def _lines_from_messages(messages, max_lines=60):
    """Split message contents into cleaned, deduplicated lines.

    Each line is stripped and passed through fix_encoding; empty results
    and repeats are dropped. Collection stops as soon as `max_lines`
    lines have been gathered.
    """
    collected = []
    seen = set()
    for text in messages:
        for raw in text.splitlines():
            cleaned = fix_encoding(raw.strip())
            if not cleaned or cleaned in seen:
                continue
            seen.add(cleaned)
            collected.append(cleaned)
            if len(collected) >= max_lines:
                return collected
    return collected
|
|
|
|
class FloodMode:
    """
    Wide, noisy retrieval: pull a large candidate pool, randomly
    subsample it, and emit every line that has not appeared in any of
    the last few frames. Good for raw stream-of-consciousness output.
    """

    def __init__(self, embedding_index, nexus_conn, search_k=180, final_k=90,
                 sample_size=42, last_n=3):
        # Shared index / DB handles — referenced, never copied.
        self.embedding_index = embedding_index
        self.nexus_conn = nexus_conn
        self.search_k = search_k
        self.final_k = final_k
        self.sample_size = sample_size
        # Rolling window of the line-sets emitted in the last `last_n` frames.
        self.previous_sets = deque(maxlen=last_n)

    def step(self, semantic_embedding):
        """Retrieve, sample, and return lines unseen in recent frames."""
        query_np = semantic_embedding.detach().cpu().numpy()
        hits = _retrieve(self.embedding_index, self.nexus_conn, query_np,
                         k=self.search_k)

        candidates = [text for text, _ in hits[:self.final_k]]
        if not candidates:
            return []

        # Random subsample keeps the stream varied frame to frame.
        chosen = random.sample(candidates, min(self.sample_size, len(candidates)))

        frame_lines = {
            stripped
            for msg in chosen
            for stripped in (part.strip() for part in msg.splitlines())
            if stripped
        }

        # Suppress anything shown inside the rolling window, then record
        # this frame's full set for the next rounds.
        fresh = frame_lines.difference(*self.previous_sets)
        self.previous_sets.append(frame_lines)

        return sorted(filter(None, map(fix_encoding, fresh)))
|
|
|
|
class DriftMode:
    """
    Emit output only when the semantic pointer moves significantly.
    The retrieval query is the current position pushed along the
    direction of movement (current - previous), so whatever the signal
    is shifting toward gets amplified.

    Parameters:
        move_threshold: minimum cosine distance between consecutive
            embeddings required to trigger output
        amplify: weight on the movement delta (1.0 = pure direction,
            0.0 = pure position)
        search_k: number of candidates to retrieve
        cooldown: minimum steps between outputs
        max_lines: cap on emitted lines per frame
    """

    def __init__(self, embedding_index, nexus_conn, search_k=64,
                 move_threshold=0.05, amplify=0.5, cooldown=3, max_lines=30):
        self.embedding_index = embedding_index
        self.nexus_conn = nexus_conn
        self.search_k = search_k
        self.move_threshold = move_threshold
        self.amplify = amplify
        self.cooldown = cooldown
        self.max_lines = max_lines

        # Previous unit-normalized embedding; None until the first step.
        self.prev_embedding = None
        self.steps_since_emit = 0
        self.prev_lines = set()

    @staticmethod
    def _unit(vec):
        """Scale vec to unit length; a zero vector passes through unchanged."""
        magnitude = np.linalg.norm(vec)
        return vec / magnitude if magnitude > 0 else vec

    def step(self, semantic_embedding):
        """Process one embedding; return lines when movement exceeds the
        threshold and the cooldown has elapsed, otherwise []."""
        current = self._unit(semantic_embedding.detach().cpu().numpy().flatten())

        self.steps_since_emit += 1

        # First frame only seeds the reference point.
        if self.prev_embedding is None:
            self.prev_embedding = current
            return []

        # Cosine distance travelled since the previous frame.
        movement = 1.0 - np.dot(current, self.prev_embedding)
        if movement < self.move_threshold or self.steps_since_emit < self.cooldown:
            return []

        # Query along the direction of travel, amplified.
        direction = self._unit(current - self.prev_embedding)
        probe = self._unit(current + self.amplify * direction)

        self.prev_embedding = current
        self.steps_since_emit = 0

        hits = _retrieve(self.embedding_index, self.nexus_conn,
                         probe.reshape(1, -1), k=self.search_k)

        # Drop lines shown last emission; remember what we emit now.
        fresh = [line
                 for line in _lines_from_messages([c for c, _ in hits], self.max_lines)
                 if line not in self.prev_lines]
        self.prev_lines = set(fresh)
        return fresh
|
|
|
|
class FocusMode:
    """
    Maintain an exponential moving average of embeddings. Only emit
    when the centroid shifts enough. Surfaces the persistent underlying
    theme rather than moment-to-moment noise.

    Parameters:
        alpha: EMA smoothing factor (lower = smoother, more stable)
        shift_threshold: minimum cosine distance of centroid movement to emit
        search_k: candidates to retrieve
        top_n: how many top results to show (closest to centroid)
        max_lines: cap on emitted lines per frame
    """

    def __init__(self, embedding_index, nexus_conn, search_k=48,
                 alpha=0.15, shift_threshold=0.02, top_n=20, max_lines=25):
        self.embedding_index = embedding_index
        self.nexus_conn = nexus_conn
        self.search_k = search_k
        self.alpha = alpha
        self.shift_threshold = shift_threshold
        self.top_n = top_n
        self.max_lines = max_lines

        # EMA state; None until the first embedding arrives.
        self.centroid = None
        self.last_emit_centroid = None
        self.prev_lines = set()

    def step(self, semantic_embedding):
        """Fold the new embedding into the EMA centroid; return retrieved
        lines when the centroid has drifted past shift_threshold, else []."""
        emb_np = semantic_embedding.detach().cpu().numpy().flatten()

        norm = np.linalg.norm(emb_np)
        emb_normed = emb_np / norm if norm > 0 else emb_np

        # First frame: seed the EMA and emit nothing.
        if self.centroid is None:
            self.centroid = emb_normed.copy()
            self.last_emit_centroid = emb_normed.copy()
            return []

        self.centroid = self.alpha * emb_normed + (1.0 - self.alpha) * self.centroid

        c_norm = np.linalg.norm(self.centroid)
        centroid_normed = self.centroid / c_norm if c_norm > 0 else self.centroid

        # Cosine distance between the current centroid and the centroid
        # recorded at the time of the last emission.
        cos_dist = 1.0 - np.dot(centroid_normed, self.last_emit_centroid)

        if cos_dist < self.shift_threshold:
            return []

        self.last_emit_centroid = centroid_normed.copy()

        results = _retrieve(self.embedding_index, self.nexus_conn,
                            centroid_normed.reshape(1, -1), k=self.search_k)

        # FIX: the previous version sliced `results[:top_n]` and ran
        # _lines_from_messages twice with identical inputs (once for the
        # emitted lines, once to rebuild prev_lines). Compute once and reuse.
        top_messages = [content for content, _ in results[:self.top_n]]
        all_lines = _lines_from_messages(top_messages, self.max_lines)

        # Emit only lines not shown last emission, but remember the FULL
        # set so repeats stay suppressed on the next emission.
        fresh = [line for line in all_lines if line not in self.prev_lines]
        self.prev_lines = set(all_lines)

        return fresh
|
|
|
|
class LayeredMode:
    """
    Run three retrieval timescales in parallel and label each section:

        [fast] — what just changed (high threshold, small k)
        [mid]  — recent theme (EMA with medium alpha)
        [slow] — deep undercurrent (EMA with low alpha)

    A section appears only when its own layer's threshold is crossed;
    when no layer fires, the step yields no output at all.
    """

    def __init__(self, embedding_index, nexus_conn, search_k=48, max_lines_per_layer=10):
        self.layers = {
            'fast': DriftMode(embedding_index, nexus_conn, search_k=search_k,
                              move_threshold=0.08, amplify=0.7, cooldown=1,
                              max_lines=max_lines_per_layer),
            'mid': FocusMode(embedding_index, nexus_conn, search_k=search_k,
                             alpha=0.25, shift_threshold=0.03, top_n=16,
                             max_lines=max_lines_per_layer),
            'slow': FocusMode(embedding_index, nexus_conn, search_k=search_k,
                              alpha=0.05, shift_threshold=0.015, top_n=12,
                              max_lines=max_lines_per_layer),
        }

    def step(self, semantic_embedding):
        """Advance every layer and return the labelled sections of those
        that fired, in fast/mid/slow order ([] when none did)."""
        # Step ALL layers first so each one's internal state advances
        # every frame regardless of which ones produce output.
        fired = {}
        for name, layer in self.layers.items():
            section = layer.step(semantic_embedding)
            if section:
                fired[name] = section

        rendered = []
        for name in ('fast', 'mid', 'slow'):
            if name in fired:
                rendered.append(f"── {name} ──")
                rendered.extend(fired[name])
                rendered.append("")
        return rendered
|
|