acb committed on
Commit
5e284bb
·
verified ·
1 Parent(s): 55fdccb

Upload 5 files

Browse files
Files changed (5) hide show
  1. cyton.py +381 -0
  2. decode.py +612 -0
  3. eegembed.py +543 -0
  4. embed.py +424 -0
  5. morphism.py +436 -0
cyton.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import sys
3
+ import os
4
+ import serial
5
+ import time
6
+ import paramiko
7
+ import io
8
+ from pathlib import Path
9
+ from datetime import datetime
10
+
11
def create_ssh_connection():
    """Open an SSH session to the remote server, or return None on failure.

    Host-key verification is relaxed (AutoAddPolicy) so a first-time
    connection does not need a known_hosts entry.
    """
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        client.connect('topos.exypno.tech', port=420, username='albert')
    except Exception as e:
        print(f"Failed to connect to remote server: {e}")
        return None
    return client
21
+
22
def set_gain(ser, gain=8):
    """Set the amplifier gain on all 16 channels (Cyton + Daisy).

    Args:
        ser: Open serial connection to the board.
        gain: Gain multiplier; one of 1, 2, 4, 6, 8, 12, 24 (default 8).

    Raises:
        ValueError: If `gain` is not one of the supported values.
    """
    # Validate up front so a bad value produces a clear error instead of
    # the cryptic ValueError that list.index() would raise mid-sequence.
    gain_mapping = [1, 2, 4, 6, 8, 12, 24]
    if gain not in gain_mapping:
        raise ValueError(f"Unsupported gain {gain}. Supported gains: {gain_mapping}")

    print(f"Setting {gain}x gain on all channels...")

    # Stop any streaming first
    ser.write(b's')
    time.sleep(0.5)

    # The board command encodes gain as the index into the mapping table.
    gain_val = gain_mapping.index(gain)

    # Main board channels (1-8)
    main_channels = ['1', '2', '3', '4', '5', '6', '7', '8']
    # Daisy board channels (9-16 represented as Q-I)
    daisy_channels = ['Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I']

    # Combine all per-channel 'x...X' configuration commands into one write.
    commands = ''.join(f'x{ch}0{gain_val}0000X' for ch in main_channels + daisy_channels)

    # Send all commands at once
    ser.write(commands.encode())
    time.sleep(0.5)

    # Clear any response from the serial buffer
    ser.reset_input_buffer()

    print("Gain settings updated")
51
+
52
def set_sample_rate(ser, freq):
    """Set sample rate using the '~' command"""
    # Board command codes for each supported sampling frequency (Hz).
    freq_mapping = {
        16000: '0',
        8000: '1',
        4000: '2',
        2000: '3',
        1000: '4',
        500: '5',
        250: '6'
    }

    if freq not in freq_mapping:
        raise ValueError(f"Unsupported frequency {freq}Hz. Supported rates: {list(freq_mapping.keys())}")

    # Halt any active streaming before reconfiguring the board.
    ser.write(b's')
    time.sleep(0.5)

    # Apply the new rate via the '~<code>' command.
    rate_code = freq_mapping[freq]
    ser.write(f"~{rate_code}".encode())
    time.sleep(0.5)

    # Discard the board's textual acknowledgement.
    ser.reset_input_buffer()
    print(f"Sample rate set to {freq}Hz")
80
+
81
def init_board(ser):
    """Initialize the OpenBCI board for 16 channels.

    Performs, in order: stop streaming, soft reset, buffer flush,
    16-channel (Daisy) mode, per-channel power-on, a switch to 230400
    baud, and a default gain setting. The command/delay sequence is
    timing-sensitive; do not reorder.

    Args:
        ser: Open serial connection to the board (initially at 115200 baud).
    """
    print("Initializing board...")

    # Stop any previous streaming
    ser.write(b's')
    time.sleep(1)

    # Soft reset
    ser.write(b'v')
    time.sleep(2)

    # Clear buffers
    ser.reset_input_buffer()
    ser.reset_output_buffer()

    # Enable 16 channel mode
    ser.write(b'C')
    time.sleep(1)

    # Enable all channels (1-16)
    # First 8 channels
    commands = [b'!', b'@', b'#', b'$', b'%', b'^', b'&', b'*']
    # Next 8 channels (Daisy module)
    commands.extend([b'Q', b'W', b'E', b'R', b'T', b'Y', b'U', b'I'])

    for cmd in commands:
        ser.write(cmd)
        time.sleep(0.1)

    # Set high-speed mode
    ser.write(b'\xF0\x06')  # Set baud rate to 230400
    time.sleep(1)
    # Re-point the local serial object at the board's new baud rate.
    ser.baudrate = 230400

    # NOTE(review): gain is set to 2 here, but process_packet() scales
    # samples assuming a gain of 24 — confirm which gain is intended,
    # otherwise the microvolt values are off by a constant factor.
    set_gain(ser, gain=2)

    print("Board initialized")
119
+
120
def find_packet_start(ser):
    """Scan the serial stream for the 0xA0 packet-header byte.

    Args:
        ser: Open serial connection.

    Returns:
        The single header byte (b'\xa0') once found, or None when the
        stream runs dry (ser.read() returns no data, e.g. on timeout).
    """
    while True:
        byte = ser.read()
        if not byte:
            # BUG FIX: the original indexed byte[0] unconditionally, so an
            # empty read (serial timeout) raised IndexError. Return None so
            # the caller can retry or bail out cleanly.
            return None
        if byte[0] == 0xA0:  # Header byte
            return byte
127
+
128
def read_complete_packet(ser):
    """Read one properly aligned 33-byte packet from the serial stream.

    Returns the full packet (0xA0 header + 32 payload bytes) or None
    when no well-formed packet can be obtained.
    """
    header = find_packet_start(ser)
    if header:
        body = ser.read(32)
        # A valid packet has exactly 32 bytes after the header and ends
        # with a footer byte in the 0xC0-0xCF range.
        if len(body) == 32 and (body[31] & 0xF0) == 0xC0:
            return header + body
    return None
145
+
146
def process_packet(packet):
    """Decode one 33-byte OpenBCI packet into 8 channel values in microvolts.

    Each channel sample is a big-endian 24-bit two's-complement integer;
    the eight samples start at byte 2 (after the header and sample-number
    bytes). Values are scaled assuming a gain of 24 with a 4.5 V reference.

    Returns:
        A list of 8 floats (microvolts), or None if the packet length
        is not exactly 33 bytes.
    """
    if len(packet) != 33:
        return None

    # Convert to microvolts: 4.5V / gain / (2^23 - 1); hoisted out of the
    # loop since it is identical for every channel. (Gain of 24.)
    scale_factor = 4.5 / (24.0 * 8388607.0) * 1000000

    channels = []
    for i in range(8):
        offset = 2 + i * 3
        # int.from_bytes with signed=True performs the same 24-bit
        # two's-complement conversion the original did by hand.
        raw = int.from_bytes(packet[offset:offset + 3], 'big', signed=True)
        channels.append(raw * scale_factor)

    return channels
169
+
170
def start_sd_recording(ser, duration='G'):
    """Start recording to SD card with specified duration
    Duration codes:
        A = 5MIN
        S = 15MIN
        F = 30MIN
        G = 1HR (default)
        H = 2HR
        J = 4HR
        K = 12HR
        L = 24HR
        a = ~14sec (test)
    """
    valid_durations = {'A', 'S', 'F', 'G', 'H', 'J', 'K', 'L', 'a'}
    if duration not in valid_durations:
        raise ValueError(f"Invalid duration code. Valid codes: {valid_durations}")

    print(f"Starting SD card recording with duration code {duration}")
    # Select the SD file duration, then 'b' starts streaming, which also
    # begins writing samples to the card.
    for cmd in (duration.encode(), b'b'):
        ser.write(cmd)
        time.sleep(0.5)
192
+
193
def stop_sd_recording(ser):
    """Stop SD card recording"""
    print("Stopping SD card recording")
    # 's' halts streaming, 'j' closes the file on the SD card.
    for cmd in (b's', b'j'):
        ser.write(cmd)
        time.sleep(0.5)
200
+
201
def sd_record(port, duration='G', sample_rate=1000):
    """Record a session to the board's SD card, blocking until done.

    Opens the serial port, initializes the board, sets the sample rate,
    starts SD recording, then waits out the requested duration (Ctrl-C
    stops early). Recording is always stopped and the port closed, even
    on interrupt or error.

    Args:
        port: Serial device path (e.g. '/dev/ttyUSB0').
        duration: One-letter SD duration code (see start_sd_recording).
        sample_rate: Sampling frequency in Hz (see set_sample_rate).
    """
    # Wall-clock seconds to wait for each SD duration code.
    duration_map = {
        'A': 5*60,      # 5 minutes
        'S': 15*60,     # 15 minutes
        'F': 30*60,     # 30 minutes
        'G': 60*60,     # 1 hour
        'H': 2*60*60,   # 2 hours
        'J': 4*60*60,   # 4 hours
        'K': 12*60*60,  # 12 hours
        'L': 24*60*60,  # 24 hours
        'a': 14         # ~14 seconds (test)
    }

    # Open serial port
    ser = serial.Serial(port, 115200)
    time.sleep(2)

    try:
        # Initialize board
        init_board(ser)

        # Set sample rate
        set_sample_rate(ser, sample_rate)

        # Start recording
        start_sd_recording(ser, duration)

        # Calculate wait time
        wait_time = duration_map[duration]
        start_time = time.time()

        print(f"Recording to SD card for {wait_time} seconds...")

        try:
            # Busy-wait with a countdown display until the duration elapses.
            while (time.time() - start_time) < wait_time:
                remaining = wait_time - (time.time() - start_time)
                print(f"\rRecording... {remaining:.1f} seconds remaining ", end='')
                time.sleep(0.1)

        except KeyboardInterrupt:
            print("\nRecording interrupted by user")

        finally:
            # Always stop recording
            stop_sd_recording(ser)
            print("\nRecording complete")

    finally:
        ser.close()
251
+
252
def main():
    """CLI entry point for the OpenBCI recording tool.

    Modes:
      --sd       : record onto the board's SD card (no PC streaming).
      --remote   : stream CSV rows over SFTP to the remote server.
      default    : stream CSV rows into a local file.

    BUG FIX: the 'Started recording to' and 'Data saved to:' status
    messages previously contained the literal text '(unknown)' instead of
    interpolating the output filename; they now report the real file.
    """
    parser = argparse.ArgumentParser(description='OpenBCI EEG Recording Tool')
    parser.add_argument('--port', '-p', type=str, default='/dev/ttyUSB0',
                        help='Serial port to use (default: /dev/ttyUSB0)')
    parser.add_argument('--filename', '-o', type=str,
                        help='Output filename (default: openbci_<timestamp>.txt)')
    parser.add_argument('--sd', action='store_true',
                        help='Record to SD card instead of streaming to PC')
    parser.add_argument('--duration', type=str, default='G',
                        help='SD card recording duration code (default: G = 1 hour)')
    parser.add_argument('--sample-rate', type=int, default=1000,
                        help='Sample rate in Hz (default: 1000)')
    parser.add_argument('--remote', action='store_true',
                        help='Write to remote server instead of local file')
    args = parser.parse_args()

    if args.sd:
        sd_record(args.port, args.duration, args.sample_rate)
        return

    if args.filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        args.filename = f"openbci_{timestamp}.txt"

    # Open serial port
    ser = serial.Serial(args.port, 115200)
    time.sleep(2)
    init_board(ser)

    filename = args.filename

    if args.remote:
        ssh = create_ssh_connection()
        if not ssh:
            print("Failed to establish SSH connection. Exiting.")
            return

        sftp = ssh.open_sftp()
        remote_file = sftp.open(filename, 'w')

        # Write CSV header row (timestamp + 16 channel columns).
        header = "Timestamp,"
        header += ",".join([f"Channel{i+1}" for i in range(16)])
        header += "\n"
        remote_file.write(header)
    else:
        # Local file: write the header now; data rows are appended later.
        with open(filename, 'w') as f:
            header = "Timestamp,"
            header += ",".join([f"Channel{i+1}" for i in range(16)])
            header += "\n"
            f.write(header)

    # Start streaming
    ser.write(b'b')
    time.sleep(0.5)
    ser.reset_input_buffer()

    print(f"Started recording to {filename}")

    packet_count = 0
    start_time = time.time()
    buffer = io.StringIO()
    last_write = time.time()

    try:
        while True:
            # Read two properly aligned packets: with the Daisy module the
            # board interleaves main-board and Daisy packets, so one sample
            # of all 16 channels is two consecutive packets.
            packet1 = read_complete_packet(ser)
            if packet1:
                packet2 = read_complete_packet(ser)
                if packet2:
                    # Process both packets
                    data1 = process_packet(packet1)
                    data2 = process_packet(packet2)

                    if data1 and data2:
                        packet_count += 1
                        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
                        all_channels = data1 + data2  # Combine all 16 channels
                        data_str = [f"{x:.6f}" for x in all_channels]
                        line = timestamp + "," + ",".join(data_str) + "\n"

                        if args.remote:
                            buffer.write(line)

                            # Flush buffered rows to the remote file every 100ms
                            if time.time() - last_write >= 0.1:
                                remote_file.write(buffer.getvalue())
                                buffer = io.StringIO()
                                last_write = time.time()
                        else:
                            with open(filename, 'a') as f:
                                f.write(line)

                        # Periodic status line (every 125 packet pairs).
                        if packet_count % 125 == 0:
                            elapsed_time = time.time() - start_time
                            rate = packet_count / elapsed_time
                            print(f"\rRecording... {rate:.1f} Hz, {packet_count} packets", end='')

            # Check for buffer overflow
            if ser.in_waiting > 1000:
                print(f"\nWarning: Buffer building up ({ser.in_waiting} bytes)")
                ser.reset_input_buffer()

    except KeyboardInterrupt:
        # Stop streaming
        ser.write(b's')
        ser.close()

        if args.remote:
            # Write any remaining data in buffer
            if buffer.getvalue():
                remote_file.write(buffer.getvalue())
            remote_file.close()
            sftp.close()
            ssh.close()

        # Print final statistics
        elapsed_time = time.time() - start_time
        rate = packet_count / elapsed_time
        print(f"\n\nRecording stopped")
        print(f"Duration: {elapsed_time:.1f} seconds")
        print(f"Packets recorded: {packet_count}")
        print(f"Average sample rate: {rate:.1f} Hz")
        print(f"Data saved to: {filename}")
379
+
380
+ if __name__ == "__main__":
381
+ main()
decode.py ADDED
@@ -0,0 +1,612 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import sys
4
+ import time
5
+ import hashlib
6
+ import numpy as np
7
+ import torch
8
+ import sqlite3
9
+ import logging
10
+ import argparse
11
+ import random
12
+ import traceback
13
+ import faiss
14
+ import pickle
15
+ from datetime import datetime
16
+ from collections import deque, defaultdict
17
+ from typing import List, Dict, Tuple, Optional, Union, Any
18
+ from pathlib import Path
19
+
20
+ # Import from our streaming module
21
+ from eegembed import EEGEmbeddingStream
22
+
23
+ PRINT_DEBUG_HASH = False
24
+
25
def fix_encoding(s):
    """Best-effort cleanup of mojibake text.

    Args:
        s: A str (possibly containing surrogate escapes) or raw bytes.

    Returns:
        The cleaned string; "" when mojibake marker characters
        ('ì', 'í', 'ï') are detected; the input unchanged when falsy.
    """
    if not s:
        return s

    if isinstance(s, str):
        b = s.encode('utf-8', 'surrogateescape')
    else:
        b = s

    fixed = b.decode('utf-8', 'replace')
    # BUG FIX: test markers against the decoded string, not the raw input —
    # the original checked `'ì' in s`, which raises TypeError when s is bytes.
    if 'ì' in fixed or 'í' in fixed or 'ï' in fixed:
        return ""

    return fixed
39
+
40
+ # Set up logging
41
+ logging.basicConfig(
42
+ level=logging.INFO,
43
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
44
+ )
45
+ logger = logging.getLogger("EEGSemanticStream")
46
+
47
def setup_eeg_logger(eeg_file_path):
    """Set up a file logger based on the EEG filename.

    Creates ./session_logs next to this module (if needed) and opens a
    timestamped log file for the session. Returns the open file handle;
    the caller owns (and must close) it.
    """
    base_name = os.path.basename(eeg_file_path)
    stem = os.path.splitext(base_name)[0]

    # Session logs live alongside this module.
    logs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "session_logs")
    os.makedirs(logs_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_path = os.path.join(logs_dir, f"{stem}_{timestamp}.log")

    log_file = open(log_path, "w", encoding="utf-8")
    log_file.write(f"--- Session started at {timestamp} for EEG file: {base_name} ---\n")
    log_file.flush()

    return log_file
64
+
65
+
66
class EmbeddingIndex:
    """FAISS-backed nearest-neighbor index over message embeddings.

    Embeddings are L2-normalized and stored in an inner-product (IndexFlatIP)
    index, so inner product equals cosine similarity. Runs on GPU when FAISS
    reports one available, falling back to CPU on any GPU error.
    """
    def __init__(self, dim=1536, use_gpu=True):
        # Dimensionality of the stored embedding vectors.
        self.dim = dim
        # Only attempt GPU when FAISS actually sees a device.
        self.use_gpu = use_gpu and faiss.get_num_gpus() > 0
        self.index = None
        self.gpu_resources = None
        # Maps FAISS row position -> database message_id.
        self.message_ids = []

        if self.use_gpu:
            self.gpu_resources = faiss.StandardGpuResources()

    def add_embeddings(self, embeddings: np.ndarray, message_ids: List[int]):
        """Build a fresh index; row i of `embeddings` is labeled message_ids[i]."""
        logger.info(f"Building FAISS index with {len(embeddings)} embeddings")

        # Normalize rows (epsilon guards zero vectors) so IP == cosine sim.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        embeddings = embeddings / (norms + 1e-8)

        cpu_index = faiss.IndexFlatIP(self.dim)
        cpu_index.add(embeddings.astype(np.float32))

        if self.use_gpu:
            try:
                self.index = faiss.index_cpu_to_gpu(self.gpu_resources, 0, cpu_index)
                logger.info("Using GPU FAISS index")
            except Exception as e:
                # Fall back to CPU and remember the decision for save().
                logger.warning(f"GPU failed: {e}. Using CPU.")
                self.index = cpu_index
                self.use_gpu = False
        else:
            self.index = cpu_index
            logger.info("Using CPU FAISS index")

        self.message_ids = message_ids

    def get_current_count(self):
        """Return the number of vectors in the index (0 when unbuilt)."""
        if self.index is None:
            return 0
        return self.index.ntotal

    def search(self, query: np.ndarray, k: int = 10) -> Tuple[np.ndarray, np.ndarray]:
        """Return (cosine distances, message_id labels) of the k nearest rows.

        Raises:
            RuntimeError: If the index has not been built or loaded yet.
        """
        if self.index is None:
            raise RuntimeError("Index not initialized")

        # Normalize the query so inner product is cosine similarity.
        norm = np.linalg.norm(query)
        if norm > 0:
            query = query / norm

        actual_k = min(k, self.get_current_count())
        if actual_k == 0:
            return np.array([]), np.array([])

        similarities, indices = self.index.search(query.astype(np.float32), actual_k)
        # Convert similarity to cosine distance.
        distances = 1.0 - similarities

        # Translate FAISS row positions into database message ids.
        labels = np.array([[self.message_ids[idx] for idx in row] for row in indices])

        return distances, labels

    def save(self, path: str):
        """Persist the index to {path}.index and ids to {path}_message_ids.pkl."""
        if self.index is None:
            raise RuntimeError("Cannot save uninitialized index")

        # GPU indexes must be copied back to host memory before writing.
        if self.use_gpu:
            cpu_index = faiss.index_gpu_to_cpu(self.index)
            faiss.write_index(cpu_index, f"{path}.index")
        else:
            faiss.write_index(self.index, f"{path}.index")

        with open(f"{path}_message_ids.pkl", 'wb') as f:
            pickle.dump(self.message_ids, f)

    @classmethod
    def load(cls, path: str, use_gpu: bool = True) -> 'EmbeddingIndex':
        """Load an index previously written by save(), moving it to GPU if possible.

        NOTE(review): the instance is constructed with the default dim (1536)
        regardless of the loaded index's actual dimensionality — confirm all
        saved indexes use that dim.
        """
        with open(f"{path}_message_ids.pkl", 'rb') as f:
            message_ids = pickle.load(f)

        index = cls(use_gpu=use_gpu)
        index.message_ids = message_ids

        cpu_index = faiss.read_index(f"{path}.index")

        if index.use_gpu:
            try:
                index.index = faiss.index_cpu_to_gpu(index.gpu_resources, 0, cpu_index)
                logger.info("Loaded existing index and moved to GPU")
            except Exception as e:
                logger.warning(f"Failed to move loaded index to GPU: {e}. Using CPU.")
                index.index = cpu_index
                index.use_gpu = False
        else:
            index.index = cpu_index
            logger.info("Loaded existing index on CPU")

        return index
160
+
161
+
162
class EEGSemanticProcessor:
    """
    Process EEG data through autoencoder and semantic model pipeline,
    then lookup similar messages.

    Pipeline: EEGEmbeddingStream (windowed autoencoder embeddings) ->
    traced semantic model (maps EEG embedding to text-embedding space) ->
    FAISS similarity search over a database of message embeddings.
    """
    def __init__(
        self,
        autoencoder_model_path: str,
        semantic_model_path: str,
        nexus_db_path: str,
        embeddings_db_path: str,
        index_path: str = None,
        eeg_file_path: str = None,
        window_size: int = 624,
        stride: int = 64,
        batch_size: int = 32,
        normalize: bool = True,
        device: str = None,
        search_k: int = 180,
        final_k: int = 90,
        use_raw_eeg: bool = False,
        last_n_messages: int = 3,
        input_dim_override: int = None,
        save_vectors: bool = False,
        vector_output_path: str = None
    ):
        # Pick CUDA automatically unless the caller pinned a device.
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)

        logger.info(f"Using device: {self.device}")

        # How many previous batches to subtract in the repetition filter.
        # NOTE(review): callers may pass None here (see main's --last_n),
        # which makes previous_message_sets unbounded — confirm intent.
        self.last_n_messages = last_n_messages
        self.use_raw_eeg = use_raw_eeg
        self.input_dim_override = input_dim_override

        # Initialize EEG stream (windows the file and runs the autoencoder).
        self.eeg_stream = EEGEmbeddingStream(
            file_path=eeg_file_path if eeg_file_path else "",
            model_path=autoencoder_model_path,
            window_size=window_size,
            stride=stride,
            normalize=normalize,
            batch_size=batch_size,
            device=self.device
        )

        # Load traced semantic model
        logger.info(f"Loading traced semantic model from {semantic_model_path}")
        self.semantic_model = torch.jit.load(semantic_model_path, map_location=self.device)
        self.semantic_model.eval()

        # Probe to get input/output dims by trying candidate input sizes;
        # a TorchScript model raises on a mismatched shape, so the first
        # size that runs is taken as the model's input dim.
        self._semantic_input_dim = None
        self._semantic_output_dim = None
        for test_dim in [64, 10112]:
            try:
                dummy = torch.randn(1, test_dim, device=self.device)
                with torch.no_grad():
                    out = self.semantic_model(dummy)
                self._semantic_input_dim = test_dim
                self._semantic_output_dim = out.shape[1]
                logger.info(f"Semantic model: input_dim={self._semantic_input_dim}, output_dim={self._semantic_output_dim}")
                break
            except Exception:
                continue

        if self._semantic_input_dim is None:
            logger.warning("Could not auto-detect semantic model input dim. Will adapt at runtime.")

        # Per-session log file (None when no EEG file path was given).
        self.log_file = setup_eeg_logger(eeg_file_path) if eeg_file_path else None

        # Initialize database connections
        self.nexus_conn = sqlite3.connect(nexus_db_path)
        self.embeddings_conn = sqlite3.connect(embeddings_db_path)

        # Message tracking system: retrieve search_k candidates, return final_k.
        self.search_k = search_k
        self.final_k = final_k
        self.message_counts = defaultdict(int)
        self.recent_messages = deque(maxlen=10)
        self.repetition_penalty = 1.5

        logger.info("Creating embedding index")
        self.embedding_index = self._create_index(index_path)

        # Abort streaming after this many consecutive per-item failures.
        self.error_count = 0
        self.max_consecutive_errors = 5

        # When enabled, semantic vectors are collected instead of searched.
        self.save_vectors = save_vectors
        self.vector_output_path = vector_output_path

        if self.save_vectors:
            self.vectors_list = []
            self.timestamps = []
            logger.info(f"Vector saving enabled. Output will be saved to {self.vector_output_path}")

        # Line sets from the last N batches, used by _print_unique_lines.
        self.previous_message_sets = deque(maxlen=self.last_n_messages)

    def _create_index(self, index_path: str = None) -> EmbeddingIndex:
        """Create or load the embedding index for similarity search.

        An existing on-disk index is reused only when its saved metadata
        (row count and max message id) matches the embeddings database;
        otherwise the index is rebuilt from the database and re-saved.
        """

        cursor = self.embeddings_conn.cursor()

        cursor.execute("SELECT COUNT(*) FROM embeddings")
        db_count = cursor.fetchone()[0]

        cursor.execute("SELECT MAX(message_id) FROM embeddings")
        db_max_id = cursor.fetchone()[0]

        if index_path and os.path.exists(f"{index_path}.index"):
            try:
                logger.info(f"Checking existing index at {index_path}")

                index = EmbeddingIndex.load(index_path)

                metadata_path = f"{index_path}_metadata.npz"
                if os.path.exists(metadata_path):
                    metadata = np.load(metadata_path, allow_pickle=True)
                    saved_count = int(metadata.get('count', 0))
                    saved_max_id = int(metadata.get('max_message_id', 0))

                    logger.info(f"Saved index: {saved_count} items, max_id={saved_max_id}")
                    logger.info(f"Database: {db_count} items, max_id={db_max_id}")

                    # Staleness check: any mismatch forces a rebuild below.
                    if db_count != saved_count or db_max_id != saved_max_id:
                        logger.info("Database has changed. Recreating index...")
                    else:
                        logger.info("Database unchanged. Using existing index...")
                        return index

            except Exception as e:
                logger.warning(f"Error checking existing index: {str(e)}")
                logger.info("Will create new index")

        logger.info("Creating new index from database...")

        cursor.execute("SELECT message_id, embedding FROM embeddings ORDER BY message_id")

        embeddings = []
        message_ids = []

        # Embeddings are stored as raw float32 blobs.
        for message_id, emb in cursor.fetchall():
            embedding = np.frombuffer(emb, dtype=np.float32)
            embeddings.append(embedding)
            message_ids.append(message_id)

        if not embeddings:
            raise ValueError("No embeddings found in database")

        embeddings = np.vstack(embeddings)
        logger.info(f"Loaded {len(embeddings)} embeddings with shape: {embeddings.shape}")

        index = EmbeddingIndex(dim=embeddings.shape[1])
        index.add_embeddings(embeddings, message_ids)

        if index_path:
            logger.info(f"Saving index to {index_path}")
            index.save(index_path)

            # Persist the freshness metadata used by the staleness check above.
            metadata = {
                'count': db_count,
                'max_message_id': db_max_id
            }
            np.savez(f"{index_path}_metadata.npz", **metadata)

        return index

    def process_eeg_embedding(self, eeg_embedding: np.ndarray) -> torch.Tensor:
        """Convert EEG embedding to text embedding using the traced semantic model.

        Flattens the input to (batch, features) and zero-pads or truncates
        the feature dimension to match the probed semantic-model input dim.
        """
        with torch.no_grad():
            tensor = torch.tensor(eeg_embedding, dtype=torch.float32).to(self.device)

            # Ensure a batch dimension exists.
            if len(tensor.shape) < 2:
                tensor = tensor.unsqueeze(0)

            batch_size = tensor.shape[0]
            tensor = tensor.reshape(batch_size, -1)

            # Adapt dimensions if needed: pad with zeros or truncate.
            if self._semantic_input_dim is not None:
                current_features = tensor.shape[1]
                if current_features != self._semantic_input_dim:
                    if current_features < self._semantic_input_dim:
                        padded = torch.zeros(batch_size, self._semantic_input_dim, device=self.device)
                        padded[:, :current_features] = tensor
                        tensor = padded
                    else:
                        tensor = tensor[:, :self._semantic_input_dim]

            return self.semantic_model(tensor)

    def find_similar_messages(self, embedding: torch.Tensor, assistant_only=False) -> List[str]:
        """Find similar messages using the embedding index.

        Looks up the top search_k embedding neighbors, resolves each
        message id against the nexus DB (optionally restricted to
        assistant-role messages), and returns up to final_k contents.
        Returns [] on any search/lookup failure.
        """
        embedding_np = embedding.detach().cpu().numpy()
        if len(embedding_np.shape) > 1:
            embedding_np = embedding_np.reshape(1, -1)

        try:
            distances, indices = self.embedding_index.search(embedding_np, self.search_k)
            distances = distances.flatten()
            indices = indices.flatten()

            cursor = self.nexus_conn.cursor()
            candidates = []

            if assistant_only:
                query = """
                SELECT content FROM messages
                WHERE id = ? AND role = 'assistant'
                """
            else:
                query = """
                SELECT content FROM messages
                WHERE id = ?
                """

            # Results arrive in ascending-distance order; ids missing from
            # the messages table (or filtered by role) are simply skipped.
            for message_id, distance in zip(indices, distances):
                cursor.execute(query, (int(message_id),))
                if result := cursor.fetchone():
                    content = result[0]
                    candidates.append(content)

            return candidates[:self.final_k]

        except Exception as e:
            logger.error(f"Error during similarity search: {str(e)}")
            traceback.print_exc()
            return []

    def save_vectors_to_disk(self):
        """Save the collected vectors and timestamps to disk as an .npz file."""
        if not self.vectors_list:
            logger.warning("No vectors to save")
            return

        output_dir = os.path.dirname(self.vector_output_path)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        vectors_array = np.vstack(self.vectors_list)
        # Each timestamp entry is a {'start': ..., 'end': ...} dict.
        timestamps_array = np.array(self.timestamps)

        logger.info(f"Saving {len(self.vectors_list)} vectors to {self.vector_output_path}")
        np.savez(
            self.vector_output_path,
            vectors=vectors_array,
            timestamps=timestamps_array
        )
        logger.info(f"Vectors saved successfully to {self.vector_output_path}")

    def process_streaming_embeddings(self, callback=None):
        """
        Process streaming EEG embeddings through the semantic model
        and find similar messages.

        Runs until KeyboardInterrupt or too many consecutive errors.
        In save_vectors mode, similarity search is skipped and vectors
        are collected for save_vectors_to_disk() instead. Each result is
        passed to `callback` when given, otherwise printed via
        _print_unique_lines().
        """
        self.eeg_stream.start()

        try:
            consecutive_errors = 0
            while True:
                try:
                    for embedding_data in self.eeg_stream.get_embeddings(timeout=0.5):
                        try:
                            autoencoder_embedding = embedding_data['embedding']
                            semantic_embedding = self.process_eeg_embedding(autoencoder_embedding)

                            if self.save_vectors:
                                embedding_np = semantic_embedding.detach().cpu().numpy()
                                self.vectors_list.append(embedding_np)
                                self.timestamps.append({
                                    'start': embedding_data['start_timestamp'],
                                    'end': embedding_data['end_timestamp']
                                })

                                if len(self.vectors_list) % 100 == 0:
                                    logger.info(f"Collected {len(self.vectors_list)} vectors so far")

                                # Collection-only mode: skip similarity search.
                                continue

                            similar_messages = self.find_similar_messages(semantic_embedding)

                            result = {
                                'start_timestamp': embedding_data['start_timestamp'],
                                'end_timestamp': embedding_data['end_timestamp'],
                                'processing_time': 0,
                                'similar_messages': similar_messages
                            }

                            if callback:
                                callback(result)
                            else:
                                self._print_unique_lines(result)

                            # A successful item resets the failure streak.
                            consecutive_errors = 0

                        except Exception as e:
                            print(f"Error: {str(e)}", file=sys.stderr)
                            consecutive_errors += 1

                            if consecutive_errors >= self.max_consecutive_errors:
                                raise RuntimeError(f"Too many consecutive errors ({consecutive_errors})")

                    time.sleep(0.01)

                except Exception as e:
                    # Let the give-up error propagate; retry everything else.
                    if "Too many consecutive errors" in str(e):
                        raise
                    print(f"Error: {str(e)}", file=sys.stderr)
                    consecutive_errors += 1
                    if consecutive_errors >= self.max_consecutive_errors:
                        raise RuntimeError(f"Too many consecutive errors ({consecutive_errors})")
                    time.sleep(1)

        except KeyboardInterrupt:
            pass
        except Exception as e:
            print(f"Fatal error: {str(e)}", file=sys.stderr)
        finally:
            if self.save_vectors and self.vectors_list:
                self.save_vectors_to_disk()

            self.eeg_stream.stop()

    def _print_unique_lines(self, result):
        """Print only lines that aren't in common with the last n batches of messages.

        Samples up to 42 of the retrieved messages, splits them into
        stripped lines, removes lines seen in the previous batches, and
        prints (and logs) the survivors after mojibake cleanup.
        """
        if not result['similar_messages']:
            return

        # Random sample keeps the output volume bounded per batch.
        sample_size = min(42, len(result['similar_messages']))
        current_messages = random.sample(result['similar_messages'], sample_size)

        current_lines = set()
        for message in current_messages:
            for line in message.splitlines():
                line = line.strip()
                if line:
                    current_lines.add(line)

        # Subtract everything seen in the last N batches.
        unique_lines = current_lines.copy()
        for previous_lines in self.previous_message_sets:
            unique_lines -= previous_lines

        self.previous_message_sets.append(current_lines)

        # NOTE(review): this flag is always False, so the elif branch below
        # is dead code — presumably a leftover debug switch.
        __uniq_log_empty = False
        if unique_lines:
            if PRINT_DEBUG_HASH:
                # Prefix each line with a short md5 of itself for debugging.
                unique_lines = [f"{hash} | {line}" for (hash, line) in zip(
                    map(lambda s: hashlib.md5(s.encode()).hexdigest()[:8], unique_lines),
                    unique_lines)]

            # Drop lines that fix_encoding rejects as mojibake.
            unique_lines = filter(lambda s: bool(s), map(fix_encoding, unique_lines))

            output_text = "\n".join(sorted(unique_lines))
            print(output_text)

            if hasattr(self, 'log_file') and self.log_file:
                try:
                    self.log_file.write(output_text + "\n")
                    self.log_file.flush()
                except Exception as e:
                    print(f"Error writing to log file: {str(e)}", file=sys.stderr)

        elif __uniq_log_empty:
            logger.info(f"No unique lines")
530
+
531
+
532
def main():
    """CLI entry point: build an EEGSemanticProcessor and stream results.

    BUG FIX: --last_n previously defaulted to None, which overrode
    EEGSemanticProcessor's documented default of 3 and produced
    deque(maxlen=None) — an unbounded repetition-filter window. The
    default is now 3, matching the constructor.
    """
    parser = argparse.ArgumentParser(description='Process EEG data through semantic model and lookup similar messages')

    parser.add_argument('--autoencoder', '-a', type=str, required=True,
                        help='Path to the traced autoencoder encoder model')
    parser.add_argument('--semantic-model', '-s', type=str, required=True,
                        help='Path to the traced semantic model')

    parser.add_argument('--nexus-db', '-n', type=str,
                        default=os.path.expanduser('~/.nexus/data/nexus-new.db'),
                        help='Path to the nexus database')
    parser.add_argument('--embeddings-db', '-e', type=str, default='emb_full.db',
                        help='Path to the embeddings database')
    parser.add_argument('--index', '-i', type=str, default='embedding_index',
                        help='Path to save/load the FAISS index')

    parser.add_argument('--eeg-file', '-f', type=str, required=True,
                        help='Path to the EEG data file to monitor')
    parser.add_argument('--window-size', type=int, default=624,
                        help='Window size in samples')
    parser.add_argument('--stride', type=int, default=32,
                        help='Stride between windows')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='Batch size for processing')
    parser.add_argument('--no-normalize', dest='normalize', action='store_false',
                        help='Disable normalization of EEG data')

    parser.add_argument('--search-k', type=int, default=180,
                        help='Number of candidates to retrieve for selection')
    parser.add_argument('--final-k', type=int, default=90,
                        help='Number of results to show')

    parser.add_argument('--device', type=str, default=None,
                        help='Device to use (cuda or cpu)')

    parser.add_argument('--last_n', type=int, default=3,
                        help='Window queue size for repetition filter')

    parser.add_argument('--use-raw-eeg', action='store_true',
                        help='Use raw EEG data with semantic model (skip autoencoder)')
    parser.add_argument('--input-dim', type=int,
                        help='Override the input dimension for the semantic model')

    parser.add_argument('--save-vectors', action='store_true',
                        help='Save semantic vectors to disk without generating output')
    parser.add_argument('--vector-output', type=str, default='semantic_vectors.npz',
                        help='Path to save the semantic vectors')

    args = parser.parse_args()

    processor = EEGSemanticProcessor(
        autoencoder_model_path=args.autoencoder,
        semantic_model_path=args.semantic_model,
        nexus_db_path=args.nexus_db,
        embeddings_db_path=args.embeddings_db,
        index_path=args.index,
        last_n_messages=args.last_n,
        eeg_file_path=args.eeg_file,
        window_size=args.window_size,
        stride=args.stride,
        batch_size=args.batch_size,
        normalize=args.normalize,
        device=args.device,
        search_k=args.search_k,
        final_k=args.final_k,
        use_raw_eeg=args.use_raw_eeg,
        input_dim_override=args.input_dim,
        save_vectors=args.save_vectors,
        vector_output_path=args.vector_output
    )

    try:
        processor.process_streaming_embeddings()
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(f"Error: {str(e)}", file=sys.stderr)
609
+
610
+
611
+ if __name__ == "__main__":
612
+ main()
eegembed.py ADDED
@@ -0,0 +1,543 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import csv
4
+ import numpy as np
5
+ import torch
6
+ import threading
7
+ import queue
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ from typing import List, Dict, Tuple, Optional, Callable, Generator, Union, Any
11
+ import logging
12
+
13
+ logging.basicConfig(
14
+ level=logging.INFO,
15
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
16
+ )
17
+ logger = logging.getLogger("EEGStream")
18
+
19
+
20
class EncoderExtractor:
    """Wraps a TorchScript-traced EEG encoder and exposes embedding inference."""

    def __init__(self, model_path, device=None, force_sequence_length=None):
        # Mirror the rest of the pipeline: prefer CUDA when it is available.
        self.device = device if device is not None else torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )

        self.force_sequence_length = force_sequence_length
        logger.info(f"Loading traced encoder from {model_path} to {self.device}")
        self.model = torch.jit.load(model_path, map_location=self.device)
        self.model.eval()

        # Probe the model once with a dummy batch to discover the embedding width.
        probe = torch.randn(1, 16, force_sequence_length or 624, device=self.device)
        with torch.no_grad():
            self._embedding_size = self.model(probe).shape[1]
        logger.info(f"Embedding size: {self._embedding_size}")

    def get_embedding_size(self):
        """Return the dimensionality of the embeddings produced by the encoder."""
        return self._embedding_size

    def embed(self, data):
        """Encode a batch of EEG windows, resampling the time axis if needed.

        Args:
            data: Tensor of shape [batch, channels, time].
        """
        needs_resample = (
            self.force_sequence_length
            and data.shape[2] != self.force_sequence_length
        )
        if needs_resample:
            data = torch.nn.functional.interpolate(
                data, size=self.force_sequence_length, mode='linear', align_corners=False
            )
        with torch.no_grad():
            return self.model(data.to(self.device))
47
+
48
class EEGFileWatcher:
    """
    Watches a CSV file for new data and yields new lines as they appear.

    A daemon thread polls the file for growth and pushes every complete
    line (header first) onto an internal queue; consumers drain the queue
    with get_new_lines().
    """
    def __init__(self, file_path: str, poll_interval: float = 0.1):
        """
        Initialize the file watcher.

        Args:
            file_path: Path to the CSV file to watch
            poll_interval: How often to check for new data (in seconds)
        """
        self.file_path = Path(file_path)
        self.poll_interval = poll_interval
        self.last_position = 0  # offset of the end of the last fully-consumed line
        self.running = False
        self.thread = None
        self.queue = queue.Queue()
        self.header = None

    def start(self):
        """Start watching the file in a background thread."""
        if self.running:
            return

        self.running = True
        self.thread = threading.Thread(target=self._watch_file, daemon=True)
        self.thread.start()

    def stop(self):
        """Stop watching the file."""
        self.running = False
        if self.thread:
            self.thread.join(timeout=1.0)

    def _watch_file(self):
        """Background thread that watches the file for changes."""
        # Wait for the file to exist
        while self.running and not self.file_path.exists():
            logger.info(f"Waiting for file {self.file_path} to exist...")
            time.sleep(self.poll_interval)

        logger.info(f"File {self.file_path} found, starting to watch")

        # Keep track of the file position
        self.last_position = 0

        # Read header first so consumers can interpret the columns.
        try:
            with open(self.file_path, 'r') as f:
                self.header = f.readline().strip()
                self.last_position = f.tell()
                self.queue.put(self.header)
        except Exception as e:
            logger.error(f"Error reading header: {e}")

        while self.running:
            try:
                # Check if the file has grown
                current_size = self.file_path.stat().st_size
                if current_size > self.last_position:
                    # Read new data
                    with open(self.file_path, 'r') as f:
                        f.seek(self.last_position)
                        new_data = f.read()
                        self.last_position = f.tell()

                    # Process new lines (excluding partial lines)
                    lines = new_data.split('\n')
                    if not new_data.endswith('\n'):
                        # The last line might be incomplete; rewind so it is
                        # re-read on the next poll once it is complete.
                        # NOTE(review): len() counts characters while tell() is
                        # a byte offset — these differ for non-ASCII data;
                        # confirm the EEG CSV is ASCII-only.
                        self.last_position -= len(lines[-1])
                        lines = lines[:-1]

                    # Add complete lines to the queue
                    for line in lines:
                        if line.strip():  # Skip empty lines
                            self.queue.put(line)
            except Exception as e:
                logger.error(f"Error watching file: {e}")

            time.sleep(self.poll_interval)

    def get_new_lines(self, timeout: Optional[float] = None) -> List[str]:
        """
        Get any new lines that have been read since the last call.

        Args:
            timeout: How long to wait for the first line (in seconds).
                None means don't wait at all.

        Returns:
            List of new lines (might be empty if no new data)
        """
        lines = []
        try:
            # Bug fix: Queue.get(timeout=None) blocks *forever*, which
            # contradicted the documented "None means don't wait" contract.
            # Use a non-blocking get when no timeout is requested.
            if timeout is None:
                lines.append(self.queue.get_nowait())
            else:
                lines.append(self.queue.get(timeout=timeout))

            # Drain anything else already queued without waiting.
            while True:
                try:
                    lines.append(self.queue.get_nowait())
                except queue.Empty:
                    break
        except queue.Empty:
            pass

        return lines
158
+
159
+
160
class SlidingWindowProcessor:
    """
    Turns an append-only stream of sampled data points into fixed-length,
    optionally z-score-normalized windows with a configurable stride.
    """
    def __init__(
        self,
        window_size: int,
        stride: int,
        num_channels: int,
        channel_means: List[float],
        channel_stds: List[float],
        normalize: bool = True
    ):
        """
        Initialize the sliding window processor.

        Args:
            window_size: Number of data points in each window
            stride: Number of data points to advance between windows
            num_channels: Number of data channels
            channel_means: Per-channel mean used for normalization
            channel_stds: Per-channel standard deviation used for normalization
            normalize: Whether to z-score the data per channel
        """
        self.window_size = window_size
        self.stride = stride
        self.num_channels = num_channels
        self.channel_means = np.array(channel_means)
        self.channel_stds = np.array(channel_stds)
        self.normalize = normalize

        # Pending samples that have not yet been emitted in a window.
        self.buffer = []
        # Index into self.buffer where the next window begins.
        self.current_pos = 0

    def add_data(self, data_points: List[Dict[str, Union[str, float]]]):
        """
        Append newly arrived data points to the internal buffer.

        Args:
            data_points: List of dicts, each carrying a 'timestamp' key and
                per-channel values keyed 'Channel1' .. 'ChannelN'.
        """
        self.buffer.extend(data_points)

    def get_windows(self) -> Generator[Tuple[List[str], np.ndarray], None, None]:
        """
        Yield every complete window currently buffered.

        Yields:
            (timestamps, data) pairs where data has shape
            [num_channels, window_size].  After the generator is exhausted,
            samples that can no longer appear in any future window are
            discarded from the buffer.
        """
        while self.current_pos + self.window_size <= len(self.buffer):
            window = self.buffer[self.current_pos:self.current_pos + self.window_size]

            stamps = [sample['timestamp'] for sample in window]

            frame = np.zeros((self.num_channels, self.window_size), dtype=np.float32)
            for col, sample in enumerate(window):
                for ch in range(self.num_channels):
                    key = f'Channel{ch+1}'
                    if key in sample:
                        frame[ch, col] = sample[key]

            if self.normalize:
                for ch in range(self.num_channels):
                    if self.channel_stds[ch] > 0:
                        frame[ch] = (frame[ch] - self.channel_means[ch]) / self.channel_stds[ch]

            yield stamps, frame

            self.current_pos += self.stride

        # Trim consumed samples, keeping the (window_size - stride) overlap
        # that the next window still needs.
        if self.current_pos > 0:
            keep_from = max(0, self.current_pos - (self.window_size - self.stride))
            self.buffer = self.buffer[keep_from:]
            self.current_pos = max(0, self.current_pos - keep_from)
247
+
248
+
249
class EEGEmbeddingStream:
    """
    Stream of EEG embeddings from a live CSV file.

    Glues together an EEGFileWatcher (tails the growing CSV), a
    SlidingWindowProcessor (windowing + per-channel normalization) and an
    EncoderExtractor (traced encoder) into one generator-based pipeline.
    """
    def __init__(
        self,
        file_path: str,
        model_path: str,
        window_size: int = 256,
        stride: int = 64,
        normalizer_params: Dict[str, List[float]] = None,
        poll_interval: float = 0.1,
        batch_size: int = 32,
        normalize: bool = True,
        device: str = None,
        start_from_timestamp: str = None,
        force_sequence_length: int = None  # New parameter
    ):
        """
        Initialize the EEG embedding stream.

        Args:
            file_path: Path to the CSV file to watch
            model_path: Path to the trained model checkpoint
            window_size: Number of data points in each window
            stride: Number of data points to advance between windows
            normalizer_params: Dictionary with 'means' and 'stds' for each channel
                If None, default values will be used
            poll_interval: How often to check for new data (in seconds)
            batch_size: How many windows to encode at once
            normalize: Whether to normalize the data
            device: Device to use for encoding ('cuda' or 'cpu')
            start_from_timestamp: Only process data from this timestamp onwards
            force_sequence_length: Force the model to use this sequence length (to match training)
        """
        self.file_path = file_path
        self.poll_interval = poll_interval
        self.window_size = window_size
        self.stride = stride
        self.normalize = normalize
        self.batch_size = batch_size
        self.start_from_timestamp = start_from_timestamp

        # Set default normalizer parameters if not provided.
        # These 16-channel statistics are baked in from a prior recording
        # session — presumably raw Cyton+Daisy counts; confirm they match the
        # normalizer used when the encoder was trained.
        if normalizer_params is None:
            self.channel_means = [-70446.6562, -51197.2070, -42351.2812, -32628.9004, -58139.0547,
                                 -56271.2852, -48508.2305, -57654.8711, -69949.6484, -49663.8398,
                                 -43010.7070, -30252.7207, -56295.6250, -56075.9375, -48470.3086,
                                 -56338.5820]
            self.channel_stds = [76037.4453, 56048.1445, 71950.6328, 60051.6523, 64877.7422,
                                59371.3203, 56742.6055, 62344.4805, 75861.9141, 55614.6055,
                                70795.6719, 59312.4180, 64780.2109, 60292.6992, 56598.4609,
                                61472.3633]
        else:
            self.channel_means = normalizer_params['means']
            self.channel_stds = normalizer_params['stds']

        # Determine the number of channels
        self.num_channels = len(self.channel_means)

        # Initialize components
        self.file_watcher = EEGFileWatcher(file_path, poll_interval)
        self.window_processor = SlidingWindowProcessor(
            window_size, stride, self.num_channels,
            self.channel_means, self.channel_stds, normalize
        )

        # Set the device
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)

        # Load the encoder with the forced sequence length
        self.encoder = EncoderExtractor(model_path, self.device, force_sequence_length)

        # CSV header (parsed lazily from the first line the watcher delivers)
        self.header = None

        # Running flag
        self.running = False

        # Statistics
        self.stats = {
            'windows_processed': 0,
            'start_time': None,
            'last_timestamp': None
        }

    def start(self):
        """Start the embedding stream."""
        if self.running:
            return

        self.running = True
        self.stats['start_time'] = time.time()
        self.file_watcher.start()

    def stop(self):
        """Stop the embedding stream and log throughput statistics."""
        self.running = False
        self.file_watcher.stop()

        if self.stats['start_time'] is not None:
            elapsed = time.time() - self.stats['start_time']
            windows_processed = self.stats['windows_processed']
            if windows_processed > 0 and elapsed > 0:
                rate = windows_processed / elapsed
                logger.info(f"Processed {windows_processed} windows in {elapsed:.2f}s ({rate:.2f} windows/s)")

    def _parse_csv_line(self, line: str) -> Dict[str, Union[str, float]]:
        """
        Parse a CSV line into a data point.

        The first line ever seen is consumed as the header (the file watcher
        enqueues the header line like any other line).

        Args:
            line: CSV line

        Returns:
            Dictionary with timestamp and channel values, or None if header
        """
        if not self.header:
            # First line is the header
            self.header = line.split(',')
            logger.info(f"CSV header: {self.header}")
            return None

        values = line.split(',')
        if len(values) != len(self.header):
            logger.warning(f"Line has wrong number of values: {line}")
            return None

        data_point = {}
        for i, column in enumerate(self.header):
            if i == 0:
                # Timestamp column
                data_point['timestamp'] = values[i]

                # Skip if before start_from_timestamp.
                # NOTE(review): this is a lexicographic string comparison — it
                # assumes sortable (e.g. ISO-8601) timestamps; confirm format.
                if self.start_from_timestamp and values[i] < self.start_from_timestamp:
                    return None
            else:
                # Channel column
                try:
                    data_point[column] = float(values[i])
                except ValueError:
                    logger.warning(f"Could not parse value {values[i]} as float for column {column}")
                    data_point[column] = 0.0

        return data_point

    def get_embeddings(self, timeout: Optional[float] = None) -> Generator[Dict[str, Any], None, None]:
        """
        Get embeddings for new data.

        Args:
            timeout: How long to wait for new data (in seconds). None means don't wait.

        Yields:
            Dictionary with window information and embedding
        """
        if not self.running:
            self.start()

        # Get new lines from the file
        new_lines = self.file_watcher.get_new_lines(timeout)
        if not new_lines:
            return

        # Parse CSV lines
        data_points = []
        for line in new_lines:
            data_point = self._parse_csv_line(line)
            if data_point:
                data_points.append(data_point)
                self.stats['last_timestamp'] = data_point['timestamp']

        if not data_points:
            return

        # Add to the window processor
        self.window_processor.add_data(data_points)

        # Get windows and batch them for embedding
        windows = list(self.window_processor.get_windows())
        if not windows:
            return

        for batch_start in range(0, len(windows), self.batch_size):
            batch_end = min(batch_start + self.batch_size, len(windows))
            batch = windows[batch_start:batch_end]

            # Extract timestamps and data
            batch_timestamps = [window[0] for window in batch]
            batch_data = [window[1] for window in batch]

            # Convert to tensors; shape is [batch, channels, window_size]
            batch_tensor = torch.tensor(np.array(batch_data), dtype=torch.float32)

            # Generate embeddings
            embeddings = self.encoder.embed(batch_tensor)

            # Convert to numpy and yield
            embeddings_np = embeddings.cpu().numpy()

            for i in range(len(batch)):
                self.stats['windows_processed'] += 1
                yield {
                    'start_timestamp': batch_timestamps[i][0],
                    'end_timestamp': batch_timestamps[i][-1],
                    'embedding': embeddings_np[i],
                    'window_index': self.stats['windows_processed'] - 1
                }

    def get_streaming_embeddings(self, callback: Optional[Callable[[Dict[str, Any]], None]] = None) -> Generator[Dict[str, Any], None, None]:
        """
        Continuously generate embeddings and call the callback function with each one.

        NOTE: this is a generator in both modes — it must be iterated for any
        work (including callback invocation) to happen.

        Args:
            callback: Function to call with each embedding. If None, embeddings are yielded.

        Yields:
            If no callback is provided, yields dictionaries with window information and embedding
        """
        self.start()

        try:
            while self.running:
                any_embeddings = False
                for embedding in self.get_embeddings(timeout=self.poll_interval):
                    any_embeddings = True
                    if callback:
                        callback(embedding)
                    else:
                        yield embedding

                if not any_embeddings:
                    # No new embeddings, just wait a bit
                    time.sleep(self.poll_interval)
        finally:
            self.stop()

    def get_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the streaming process.

        Returns:
            Dictionary with statistics (windows processed, elapsed time,
            throughput, last timestamp seen)
        """
        stats = dict(self.stats)
        if stats['start_time'] is not None:
            stats['elapsed'] = time.time() - stats['start_time']
            if stats['windows_processed'] > 0 and stats['elapsed'] > 0:
                stats['windows_per_second'] = stats['windows_processed'] / stats['elapsed']
        return stats
503
+
504
+ # Example usage
505
+ def example():
506
+ def handle_embedding(embedding):
507
+ """Callback function to handle new embeddings."""
508
+ start_time = embedding['start_timestamp']
509
+ end_time = embedding['end_timestamp']
510
+ embedding_data = embedding['embedding']
511
+
512
+ print(f"Got embedding for window from {start_time} to {end_time}")
513
+ print(f"Embedding shape: {embedding_data.shape}")
514
+ print(f"First few values: {embedding_data.flatten()[:5]}")
515
+
516
+ # Create the embedding stream
517
+ stream = EEGEmbeddingStream(
518
+ file_path="eeg_data.csv",
519
+ model_path="models/eeg_autoencoder.pth",
520
+ window_size=256, # Number of data points in each window
521
+ stride=128, # How much to advance between windows
522
+ poll_interval=0.5 # Check for new data every 0.5 seconds
523
+ )
524
+
525
+ print("Starting embedding stream...")
526
+ print("Press Ctrl+C to stop")
527
+
528
+ try:
529
+ # Method 1: Using callback
530
+ stream.get_streaming_embeddings(callback=handle_embedding)
531
+
532
+ # Method 2: Using generator
533
+ # for embedding in stream.get_streaming_embeddings():
534
+ # handle_embedding(embedding)
535
+ except KeyboardInterrupt:
536
+ print("\nStopping...")
537
+ finally:
538
+ stream.stop()
539
+ print("Stopped")
540
+
541
+
542
# Run the streaming demo when executed as a script.
if __name__ == "__main__":
    example()
embed.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Text embedding script with SQLite storage (using numpy buffers)
4
+ Now with flexible text splitting modes!
5
+
6
+ Usage: python embed_flex.py <directory_path> <db_path> [--split-mode MODE]
7
+
8
+ Split modes:
9
+ - line (default): Each non-empty line becomes one embedding
10
+ - block: Double-newline separated blocks (paragraphs)
11
+ - sentence: Split on sentence boundaries (., !, ?)
12
+ - chunk: Fixed token-ish chunks with overlap (for long docs)
13
+ """
14
+
15
+ import os
16
+ import sys
17
+ import argparse
18
+ import sqlite3
19
+ import numpy as np
20
+ from tqdm import tqdm
21
+ from transformers import AutoModel, AutoTokenizer
22
+ import torch
23
+ import gc
24
+ import random
25
+ import re
26
+
27
+ INITIAL_BATCH_SIZE = 128
28
+ MIN_BATCH_SIZE = 1
29
+ SHUFFLE_SEED = 42
30
+
31
+ # Chunk mode settings
32
+ DEFAULT_CHUNK_SIZE = 512 # characters
33
+ DEFAULT_CHUNK_OVERLAP = 64
34
+
35
+
36
def create_index_if_possible(cursor):
    """Best-effort creation of an index on messages.content.

    Silently ignores the OperationalError raised when the messages table
    does not exist yet.
    """
    ddl = """
        CREATE INDEX IF NOT EXISTS idx_content ON messages(content)
    """
    try:
        cursor.execute(ddl)
    except sqlite3.OperationalError:
        pass
43
+
44
+
45
def get_existing_content(cursor):
    """Return the set of message contents already stored.

    Falls back to an empty set when the messages table does not exist yet.
    """
    try:
        cursor.execute("SELECT content FROM messages")
    except sqlite3.OperationalError:
        return set()
    return {content for (content,) in cursor.fetchall()}
51
+
52
+
53
def clear_gpu_memory():
    """Release cached CUDA allocations and trigger a garbage-collection pass.

    No-op when CUDA is unavailable.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
57
+
58
+
59
+ # =============================================================================
60
+ # SPLITTING STRATEGIES
61
+ # =============================================================================
62
+
63
def split_by_lines(text):
    """Original behavior: each non-empty line is one unit."""
    stripped = (candidate.strip() for candidate in text.split('\n'))
    return [candidate for candidate in stripped if candidate]
71
+
72
+
73
def split_by_blocks(text):
    """Split text into paragraph units separated by blank lines, collapsing
    internal whitespace inside each block."""
    collapsed = (' '.join(raw.split()) for raw in re.split(r'\n\s*\n+', text))
    return [block for block in collapsed if block]
81
+
82
+
83
def split_by_sentences(text):
    """
    Split on sentence boundaries.
    Handles common abbreviations somewhat gracefully.
    """
    # Collapse all whitespace first so the boundary regex only sees spaces.
    normalized = ' '.join(text.split())

    # A boundary is ., ! or ? followed by whitespace and a capital letter.
    # Imperfect, but reasonable for most prose.
    boundary = r'(?<=[.!?])\s+(?=[A-Z])'

    pieces = (piece.strip() for piece in re.split(boundary, normalized))
    return [piece for piece in pieces if piece]
102
+
103
+
104
def split_by_chunks(text, chunk_size=None, overlap=None):
    """
    Fixed-size character chunks with overlap.
    Good for long documents where you want sliding window coverage.

    Args:
        text: Raw input text; internal whitespace is collapsed first.
        chunk_size: Target chunk length in characters
            (None -> DEFAULT_CHUNK_SIZE).
        overlap: Characters shared between consecutive chunks
            (None -> DEFAULT_CHUNK_OVERLAP).

    Returns:
        List of non-empty chunk strings.
    """
    # Late-bind the module defaults so callers always see the current values.
    if chunk_size is None:
        chunk_size = DEFAULT_CHUNK_SIZE
    if overlap is None:
        overlap = DEFAULT_CHUNK_OVERLAP

    # Normalize whitespace
    text = ' '.join(text.split())

    if len(text) <= chunk_size:
        return [text] if text else []

    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]

        # Try to break at a word boundary if not at the end
        if end < len(text):
            last_space = chunk.rfind(' ')
            if last_space > chunk_size // 2:  # Only if we're not losing too much
                chunk = chunk[:last_space]
                end = start + last_space

        chunk = chunk.strip()
        if chunk:
            chunks.append(chunk)

        # Move forward with overlap.
        # Bug fix: the original guard was `if start <= chunks[-1] if chunks
        # else 0:` which compares the integer position against the last chunk
        # *string* and raises TypeError for any text longer than chunk_size.
        # The intent — preventing an infinite loop when overlap >= the actual
        # advance — is implemented by requiring forward progress.
        next_start = end - overlap
        if next_start <= start:
            next_start = end
        start = next_start

    return chunks
138
+
139
+
140
def get_splitter(mode, chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP):
    """Return the appropriate splitting function."""
    dispatch = {
        'line': split_by_lines,
        'block': split_by_blocks,
        'sentence': split_by_sentences,
        'chunk': lambda text: split_by_chunks(text, chunk_size, chunk_overlap),
    }
    if mode not in dispatch:
        raise ValueError(f"Unknown split mode: {mode}")
    return dispatch[mode]
152
+
153
+
154
+ # =============================================================================
155
+ # PROCESSING
156
+ # =============================================================================
157
+
158
def process_batch(model, batch_lines, cursor, task="text-matching"):
    """Encode one batch of text units and store them with their embeddings.

    Inserts a messages row and a matching embeddings row (float32 buffer)
    for each unit; per-row sqlite errors are logged and skipped.

    Returns:
        True if the batch was encoded; False on a CUDA out-of-memory error
        (GPU cache is cleared so the caller can retry with a smaller batch).
        Any other error propagates.
    """
    try:
        with torch.no_grad():
            batch_embeddings = model.encode(batch_lines, task=task, device="cuda")

        for line_text, embedding in zip(batch_lines, batch_embeddings):
            try:
                cursor.execute(
                    "INSERT INTO messages (content, role) VALUES (?, ?)",
                    (line_text, "system")
                )
                message_id = cursor.lastrowid

                # Coerce whatever the model returned into a numpy array.
                if torch.is_tensor(embedding):
                    vector = embedding.cpu().numpy()
                elif isinstance(embedding, np.ndarray):
                    vector = embedding
                else:
                    vector = np.array(embedding)

                cursor.execute(
                    "INSERT INTO embeddings (message_id, embedding) VALUES (?, ?)",
                    (message_id, vector.astype(np.float32).tobytes())
                )
            except sqlite3.Error as e:
                print(f"Error processing entry: {e}")
                continue

        return True

    except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
        if "out of memory" not in str(e).lower():
            raise
        clear_gpu_memory()
        return False
196
+
197
+
198
def convert_existing_pickles(cursor, conn):
    """Convert any existing pickle embeddings to numpy buffers"""
    import pickle

    def looks_like_numpy(blob):
        # Heuristic: a raw float32 buffer deserializes cleanly and non-empty.
        try:
            arr = np.frombuffer(blob, dtype=np.float32)
            return arr.ndim >= 1 and len(arr) > 0
        except Exception:
            return False

    def to_numpy(blob):
        # Best-effort unpickle to an ndarray; None signals "leave untouched".
        try:
            obj = pickle.loads(blob)
            if isinstance(obj, np.ndarray):
                return obj
            if torch.is_tensor(obj):
                return obj.cpu().numpy()
            return np.array(obj)
        except Exception:
            return None

    cursor.execute("SELECT COUNT(*) FROM embeddings")
    total_embeddings = cursor.fetchone()[0]

    if total_embeddings == 0:
        return

    print(f"Checking {total_embeddings} existing embeddings for pickle->numpy conversion...")

    cursor.execute("SELECT message_id, embedding FROM embeddings")
    rows = cursor.fetchall()

    converted_count = 0
    for message_id, blob in rows:
        if looks_like_numpy(blob):
            continue
        arr = to_numpy(blob)
        if arr is None:
            continue
        cursor.execute(
            "UPDATE embeddings SET embedding = ? WHERE message_id = ?",
            (arr.astype(np.float32).tobytes(), message_id)
        )
        converted_count += 1

    if converted_count > 0:
        conn.commit()
        print(f"Converted {converted_count} pickle embeddings to numpy buffers")
250
+
251
+
252
def main():
    """CLI entry point: embed all .txt files under a directory into SQLite.

    Parses options, loads the embedding model onto the GPU, ensures the
    messages/embeddings schema exists, splits each file with the selected
    strategy, shuffles deterministically (so interrupted runs resume over a
    stable order), skips units already stored, then encodes in dynamically
    sized batches — halving the batch on CUDA OOM and growing it back after
    sustained success.
    """
    parser = argparse.ArgumentParser(
        description='Generate embeddings for text files with flexible splitting modes',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Split Modes:
  line      Each non-empty line = one embedding (default, original behavior)
  block     Double-newline separated paragraphs = one embedding each
  sentence  Split on sentence boundaries (., !, ?)
  chunk     Fixed-size character chunks with overlap (good for long docs)

Examples:
  %(prog)s ~/docs embeddings.db                       # line mode (default)
  %(prog)s ~/docs embeddings.db --split-mode block    # paragraph mode
  %(prog)s ~/docs embeddings.db --split-mode sentence # sentence mode
  %(prog)s ~/docs embeddings.db --split-mode chunk --chunk-size 1024 --chunk-overlap 128
        """
    )

    parser.add_argument('directory',
                        help='Directory containing .txt files to process')
    parser.add_argument('database',
                        help='SQLite database path (will be created if not exists)')
    parser.add_argument('--split-mode', '-s',
                        choices=['line', 'block', 'sentence', 'chunk'],
                        default='line',
                        help='Text splitting strategy (default: line)')
    parser.add_argument('--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE,
                        help=f'Character chunk size for chunk mode (default: {DEFAULT_CHUNK_SIZE})')
    parser.add_argument('--chunk-overlap', type=int, default=DEFAULT_CHUNK_OVERLAP,
                        help=f'Overlap between chunks (default: {DEFAULT_CHUNK_OVERLAP})')
    parser.add_argument('--batch-size', type=int, default=INITIAL_BATCH_SIZE,
                        help=f'Initial batch size (default: {INITIAL_BATCH_SIZE})')
    parser.add_argument('--task', default='text-matching',
                        help='Encoding task (default: text-matching)')
    parser.add_argument('--model', default='jinaai/jina-embeddings-v3',
                        help='Model name (default: jinaai/jina-embeddings-v3)')
    parser.add_argument('--skip-conversion', action='store_true',
                        help='Skip checking/converting existing pickle embeddings')

    args = parser.parse_args()

    directory_path = os.path.expanduser(args.directory)
    db_path = os.path.expanduser(args.database)

    if not os.path.isdir(directory_path):
        print(f"Error: Directory '{directory_path}' does not exist")
        sys.exit(1)

    print(f"Processing directory: {directory_path}")
    print(f"Database: {db_path}")
    print(f"Split mode: {args.split_mode}")
    if args.split_mode == 'chunk':
        print(f"Chunk size: {args.chunk_size}, overlap: {args.chunk_overlap}")
    print(f"Initial batch size: {args.batch_size}")

    # Get splitter function
    splitter = get_splitter(args.split_mode, args.chunk_size, args.chunk_overlap)

    # Initialize model
    print(f"Loading model: {args.model}")
    model = AutoModel.from_pretrained(args.model, trust_remote_code=True).cuda()
    model.eval()

    # Set up SQLite
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    cursor.execute("""
        CREATE TABLE IF NOT EXISTS messages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            content TEXT,
            role TEXT
        )
    """)

    # NOTE(review): the foreign key references messages(message_id), but the
    # messages table's primary-key column is named `id`.  Harmless while
    # SQLite foreign-key enforcement is off (the default), but worth fixing.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS embeddings (
            message_id INTEGER PRIMARY KEY,
            embedding BLOB,
            FOREIGN KEY (message_id) REFERENCES messages(message_id) ON DELETE CASCADE
        )
    """)
    conn.commit()

    create_index_if_possible(cursor)
    conn.commit()

    if not args.skip_conversion:
        convert_existing_pickles(cursor, conn)

    existing_content = get_existing_content(cursor)
    print(f"Already processed: {len(existing_content)} entries")

    # Collect all text units using the selected splitter
    all_units = []
    txt_files = [f for f in os.listdir(directory_path) if f.lower().endswith(".txt")]

    if not txt_files:
        print(f"Warning: No .txt files found in {directory_path}")
        conn.close()
        return

    print(f"Found {len(txt_files)} .txt files")

    for filename in txt_files:
        filepath = os.path.join(directory_path, filename)
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
        units = splitter(content)
        all_units.extend(units)

    print(f"Total units from source ({args.split_mode} mode): {len(all_units)}")

    # Deterministic shuffle: a fixed seed makes interrupted runs resume over
    # the same ordering.
    random.seed(SHUFFLE_SEED)
    random.shuffle(all_units)

    # Filter out already processed (set membership, O(1) per unit)
    new_units = [u for u in all_units if u not in existing_content]

    print(f"Remaining to process: {len(new_units)}")

    if not new_units:
        print("Nothing new to process.")
        conn.close()
        return

    # Process with dynamic batch sizing
    batch_size = args.batch_size
    total = len(new_units)
    task = args.task

    idx = 0
    processed_count = 0

    with tqdm(total=total, desc="Processing") as pbar:
        while idx < total:
            end_idx = min(idx + batch_size, total)
            batch = new_units[idx:end_idx]

            success = process_batch(model, batch, cursor, task)

            if success:
                try:
                    conn.commit()
                except sqlite3.Error as e:
                    print(f"Error committing batch: {e}")

                batch_processed = len(batch)
                pbar.update(batch_processed)
                processed_count += batch_processed
                idx = end_idx

                # Gradually restore the batch size after earlier OOM shrinkage.
                if batch_size < args.batch_size and processed_count % (batch_size * 10) == 0:
                    batch_size = min(batch_size * 2, args.batch_size)
            else:
                # CUDA OOM: halve the batch and retry; at the minimum size,
                # skip the single offending unit so the run can proceed.
                if batch_size > MIN_BATCH_SIZE:
                    batch_size = max(batch_size // 2, MIN_BATCH_SIZE)
                    print(f"\nOOM - batch size -> {batch_size}")
                else:
                    print(f"\nSkipping: {batch[0][:100]}...")
                    idx += 1
                    pbar.update(1)
                    processed_count += 1

    conn.close()
    print(f"\nProcessed {processed_count:,} entries total.")
    print("All embeddings stored as numpy buffers (float32).")
421
+
422
+
423
# Allow running the embedder as a standalone script.
if __name__ == "__main__":
    main()
morphism.py ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ morphism — EEG-to-text semantic search
4
+
5
+ Usage:
6
+ morphism record [options]
7
+ morphism index create|info|rebuild [options]
8
+ morphism decode [options]
9
+ """
10
+
11
+ import sys
12
+ import os
13
+ import argparse
14
+
15
+ from retrieval import FloodMode, DriftMode, FocusMode, LayeredMode
16
+
17
def cmd_record(args):
    """Record EEG data from an OpenBCI Cyton+Daisy board.

    Streams 16-channel samples over the serial port and writes them as CSV
    (timestamp + 16 channel values per row) either to a local file or, with
    --remote, to a file on the remote server via SFTP. With --sd, recording
    is delegated entirely to the board's SD card. Stop with Ctrl+C.
    """
    from cyton import (
        init_board, set_sample_rate, read_complete_packet, process_packet,
        start_sd_recording, stop_sd_recording, create_ssh_connection, sd_record
    )
    import serial, time, io
    from datetime import datetime

    # SD-card recording is handled on the board side; nothing to stream here.
    if args.sd:
        sd_record(args.port, args.duration, args.sample_rate)
        return

    filename = args.output
    if filename is None:
        filename = f"openbci_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"

    ser = serial.Serial(args.port, 115200)
    time.sleep(2)  # give the board time to reset after the port opens
    init_board(ser)

    if args.sample_rate != 1000:
        set_sample_rate(ser, args.sample_rate)

    ssh, sftp, remote_file, local_file = None, None, None, None
    if args.remote:
        ssh = create_ssh_connection()
        if not ssh:
            print("SSH connection failed.")
            return
        sftp = ssh.open_sftp()
        remote_file = sftp.open(filename, 'w')

    header = "Timestamp," + ",".join(f"Channel{i+1}" for i in range(16)) + "\n"
    if args.remote:
        remote_file.write(header)
    else:
        # Keep one file handle open for the whole session instead of
        # re-opening the file in append mode for every sample (~1000x/s).
        local_file = open(filename, 'w')
        local_file.write(header)

    ser.write(b'b')  # 'b' = start streaming
    time.sleep(0.5)
    ser.reset_input_buffer()

    print(f"Recording to {filename} — Ctrl+C to stop")

    pkt_count = 0
    t0 = time.time()
    buf = io.StringIO()
    last_flush = time.time()

    try:
        while True:
            # Cyton+Daisy sends two interleaved packets per 16-channel sample:
            # main-board channels 1-8, then daisy channels 9-16.
            p1 = read_complete_packet(ser)
            if not p1:
                continue
            p2 = read_complete_packet(ser)
            if not p2:
                continue

            d1, d2 = process_packet(p1), process_packet(p2)
            if not (d1 and d2):
                continue

            pkt_count += 1
            ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
            line = ts + "," + ",".join(f"{x:.6f}" for x in d1 + d2) + "\n"

            if args.remote:
                # Batch SFTP writes (~10 Hz) to avoid a network round-trip
                # per sample.
                buf.write(line)
                if time.time() - last_flush >= 0.1:
                    remote_file.write(buf.getvalue())
                    buf = io.StringIO()
                    last_flush = time.time()
            else:
                local_file.write(line)

            if pkt_count % 125 == 0:
                rate = pkt_count / (time.time() - t0)
                print(f"\r {rate:.1f} Hz, {pkt_count} packets", end='')

            # If we fall behind the board, drop the backlog — stale samples
            # are worse than a short gap.
            if ser.in_waiting > 1000:
                ser.reset_input_buffer()

    except KeyboardInterrupt:
        pass
    finally:
        # Always stop streaming and release resources, even if an unexpected
        # exception (not just Ctrl+C) aborts the loop.
        try:
            ser.write(b's')  # 's' = stop streaming
        except Exception:
            pass  # best-effort: the port may already be unusable
        ser.close()
        if args.remote:
            if buf.getvalue():
                remote_file.write(buf.getvalue())
            remote_file.close()
            sftp.close()
            ssh.close()
        elif local_file is not None:
            local_file.close()

    elapsed = time.time() - t0
    print(f"\n\nDone — {pkt_count} packets in {elapsed:.1f}s ({pkt_count/elapsed:.1f} Hz)")
    print(f"Saved to {filename}")
+
117
def cmd_index(args):
    """Manage the text embedding index.

    Actions:
        info    — print database and FAISS-index statistics.
        create  — embed any not-yet-indexed text units from --corpus.
        rebuild — drop all existing rows, then re-embed from scratch.

    create/rebuild load the embedding model on GPU, split every .txt file in
    the corpus into units, embed them in (OOM-adaptive) batches into SQLite,
    then build the FAISS index files.
    """
    from embed import (
        get_splitter, process_batch, create_index_if_possible,
        get_existing_content, INITIAL_BATCH_SIZE, MIN_BATCH_SIZE, SHUFFLE_SEED
    )
    import sqlite3, random
    from tqdm import tqdm

    db_path = os.path.expanduser(args.db)
    index_prefix = args.index

    if args.action == 'info':
        if not os.path.exists(db_path):
            print(f"No database at {db_path}")
            return

        conn = sqlite3.connect(db_path)
        c = conn.cursor()
        c.execute("SELECT COUNT(*) FROM messages")
        msg_count = c.fetchone()[0]
        c.execute("SELECT COUNT(*) FROM embeddings")
        emb_count = c.fetchone()[0]
        conn.close()

        index_exists = os.path.exists(f"{index_prefix}.index")

        print(f"Database: {db_path}")
        print(f"Messages: {msg_count:,}")
        print(f"Embeddings: {emb_count:,}")
        print(f"FAISS index: {'exists' if index_exists else 'not built'} ({index_prefix}.index)")
        return

    if args.action in ('create', 'rebuild'):
        corpus = os.path.expanduser(args.corpus)
        if not os.path.isdir(corpus):
            print(f"Not a directory: {corpus}")
            sys.exit(1)

        splitter = get_splitter(args.split_mode, args.chunk_size, args.chunk_overlap)

        print(f"Loading model: {args.model}")
        from transformers import AutoModel
        model = AutoModel.from_pretrained(args.model, trust_remote_code=True).cuda()
        model.eval()

        conn = sqlite3.connect(db_path)
        c = conn.cursor()

        if args.action == 'rebuild':
            print("Dropping existing data...")
            c.execute("DELETE FROM embeddings")
            c.execute("DELETE FROM messages")
            conn.commit()

        c.execute("""CREATE TABLE IF NOT EXISTS messages (
            id INTEGER PRIMARY KEY AUTOINCREMENT, content TEXT, role TEXT)""")
        # BUGFIX: the parent table's primary key column is `id`, not
        # `message_id` — the old REFERENCES messages(message_id) pointed at a
        # non-existent column and would fail with foreign keys enforced.
        c.execute("""CREATE TABLE IF NOT EXISTS embeddings (
            message_id INTEGER PRIMARY KEY, embedding BLOB,
            FOREIGN KEY (message_id) REFERENCES messages(id) ON DELETE CASCADE)""")
        conn.commit()
        create_index_if_possible(c)
        conn.commit()

        existing = get_existing_content(c)
        print(f"Already indexed: {len(existing):,}")

        txt_files = [f for f in os.listdir(corpus) if f.lower().endswith('.txt')]
        if not txt_files:
            print(f"No .txt files in {corpus}")
            conn.close()
            return

        units = []
        for fn in txt_files:
            with open(os.path.join(corpus, fn), 'r', encoding='utf-8', errors='ignore') as f:
                units.extend(splitter(f.read()))

        # Deterministic shuffle so interrupted runs resume over the same order.
        random.seed(SHUFFLE_SEED)
        random.shuffle(units)
        new_units = [u for u in units if u not in existing]
        print(f"New units to embed: {len(new_units):,}")

        if not new_units:
            print("Nothing new.")
            conn.close()
            return

        batch_size = args.batch_size
        idx = 0
        processed = 0

        with tqdm(total=len(new_units), desc="Embedding") as pbar:
            while idx < len(new_units):
                batch = new_units[idx:idx + batch_size]
                ok = process_batch(model, batch, c, args.task)
                if ok:
                    conn.commit()
                    pbar.update(len(batch))
                    processed += len(batch)
                    idx += len(batch)
                else:
                    # process_batch failed (typically CUDA OOM): halve the
                    # batch size; once at the minimum, skip the offending unit.
                    if batch_size > MIN_BATCH_SIZE:
                        batch_size = max(batch_size // 2, MIN_BATCH_SIZE)
                        print(f"\nOOM — batch size → {batch_size}")
                    else:
                        # Consistent with embed.py: report what gets dropped
                        # instead of silently counting it as processed.
                        print(f"\nSkipping: {batch[0][:100]}...")
                        idx += 1
                        pbar.update(1)
                        processed += 1

        conn.close()
        print(f"Embedded {processed:,} units.")

        print("Building FAISS index...")
        _build_faiss_index(db_path, index_prefix)
        print("Done.")
+
234
+
235
def _build_faiss_index(db_path, index_prefix):
    """Build and save a FAISS index from the embeddings database.

    Loads every embedding blob from `db_path` (float32 buffers), stacks them
    into one matrix, writes `{index_prefix}.index` via EmbeddingIndex, and
    records row count / max message id in `{index_prefix}_metadata.npz` so a
    stale index can be detected later.
    """
    import sqlite3
    import numpy as np
    from decode import EmbeddingIndex

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT message_id, embedding FROM embeddings ORDER BY message_id")
        rows = cursor.fetchall()
    finally:
        # Close even if the query fails (e.g. missing table).
        conn.close()

    if not rows:
        print(" No embeddings found.")
        return

    ids = [mid for mid, _ in rows]
    vectors = np.vstack([np.frombuffer(blob, dtype=np.float32) for _, blob in rows])
    print(f" {len(vectors):,} vectors, dim={vectors.shape[1]}")

    index = EmbeddingIndex(dim=vectors.shape[1])
    index.add_embeddings(vectors, ids)
    index.save(index_prefix)

    # Derive the metadata from the rows already in memory instead of opening
    # a second connection to re-run COUNT(*) / MAX(message_id) on the same
    # table (as the original did).
    np.savez(f"{index_prefix}_metadata.npz", count=len(ids), max_message_id=max(ids))
+
270
+
271
def cmd_decode(args):
    """Run EEG → text decoding.

    Builds an EEGSemanticProcessor from the CLI arguments, starts the EEG
    stream, and feeds each windowed EEG embedding through the selected
    retrieval mode, printing (and optionally logging) the retrieved text.
    With --save-vectors, semantic vectors are collected instead of decoded
    and flushed to disk on exit.
    """
    # Hoisted: the original re-executed `import sys/time/logging` inside the
    # streaming loop on every iteration; `sys` is already a module import.
    import logging
    import time

    from decode import EEGSemanticProcessor

    processor = EEGSemanticProcessor(
        autoencoder_model_path=args.autoencoder,
        semantic_model_path=args.semantic,
        nexus_db_path=args.db,
        embeddings_db_path=args.db,
        index_path=args.index,
        eeg_file_path=args.eeg,
        window_size=args.window_size,
        stride=args.stride,
        batch_size=args.batch_size,
        device=args.device,
        search_k=args.search_k,
        final_k=args.final_k,
        use_raw_eeg=args.raw_eeg,
        input_dim_override=args.input_dim,
        save_vectors=args.save_vectors,
        vector_output_path=args.vector_output,
        last_n_messages=args.last_n,
    )

    # Lazy constructors so only the selected retrieval mode is instantiated.
    modes = {
        'flood': lambda: FloodMode(processor.embedding_index, processor.nexus_conn,
                                   search_k=args.search_k, final_k=args.final_k,
                                   last_n=args.last_n),
        'drift': lambda: DriftMode(processor.embedding_index, processor.nexus_conn,
                                   search_k=64),
        'focus': lambda: FocusMode(processor.embedding_index, processor.nexus_conn,
                                   search_k=48),
        'layered': lambda: LayeredMode(processor.embedding_index, processor.nexus_conn),
    }
    mode = modes[args.mode]()

    processor.eeg_stream.start()
    try:
        consecutive_errors = 0
        while True:
            try:
                for embedding_data in processor.eeg_stream.get_embeddings(timeout=0.5):
                    try:
                        semantic_embedding = processor.process_eeg_embedding(
                            embedding_data['embedding'])

                        if processor.save_vectors:
                            # Vector-collection mode: store, don't decode.
                            embedding_np = semantic_embedding.detach().cpu().numpy()
                            processor.vectors_list.append(embedding_np)
                            processor.timestamps.append({
                                'start': embedding_data['start_timestamp'],
                                'end': embedding_data['end_timestamp']
                            })
                            if len(processor.vectors_list) % 100 == 0:
                                logging.getLogger("EEGSemanticStream").info(
                                    f"Collected {len(processor.vectors_list)} vectors")
                            continue

                        lines = mode.step(semantic_embedding)
                        if lines:
                            output = "\n".join(lines)
                            print(output)
                            if processor.log_file:
                                processor.log_file.write(output + "\n")
                                processor.log_file.flush()

                        consecutive_errors = 0
                    except Exception as e:
                        # Per-embedding failure: keep streaming unless errors
                        # persist, then escalate to abort the session.
                        print(f"Error: {e}", file=sys.stderr)
                        consecutive_errors += 1
                        if consecutive_errors >= 5:
                            raise RuntimeError("Too many consecutive errors")

                time.sleep(0.01)
            except Exception as e:
                # The "Too many consecutive errors" escalation must propagate;
                # anything else gets a retry with backoff.
                if "Too many" in str(e):
                    raise
                print(f"Error: {e}", file=sys.stderr)
                consecutive_errors += 1
                if consecutive_errors >= 5:
                    raise
                time.sleep(1)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(f"Fatal: {e}", file=sys.stderr)
    finally:
        if processor.save_vectors and processor.vectors_list:
            processor.save_vectors_to_disk()
        processor.eeg_stream.stop()
+
368
def main():
    """Parse command-line arguments and dispatch to the chosen subcommand."""
    parser = argparse.ArgumentParser(
        prog='morphism',
        description='EEG-to-text semantic search',
    )
    subparsers = parser.add_subparsers(dest='command')

    # --- record ---
    record_p = subparsers.add_parser('record', help='Record EEG from OpenBCI Cyton+Daisy')
    record_p.add_argument('--port', '-p', default='/dev/ttyUSB0')
    record_p.add_argument('--output', '-o', default=None)
    record_p.add_argument('--sample-rate', type=int, default=1000)
    record_p.add_argument('--sd', action='store_true', help='Record to SD card')
    record_p.add_argument('--duration', default='G')
    record_p.add_argument('--remote', action='store_true', help='Stream via SSH')

    # --- index ---
    index_p = subparsers.add_parser('index', help='Manage the text embedding index')
    index_p.add_argument('action', choices=['create', 'info', 'rebuild'])
    index_p.add_argument('--corpus', '-c', default=None)
    index_p.add_argument('--db', default='morphism.db')
    index_p.add_argument('--index', default='morphism')
    index_p.add_argument('--split-mode', default='line',
                         choices=['line', 'block', 'sentence', 'chunk'])
    index_p.add_argument('--chunk-size', type=int, default=512)
    index_p.add_argument('--chunk-overlap', type=int, default=64)
    index_p.add_argument('--batch-size', type=int, default=128)
    index_p.add_argument('--task', default='text-matching')
    index_p.add_argument('--model', default='jinaai/jina-embeddings-v3')

    # --- decode ---
    decode_p = subparsers.add_parser('decode', help='Run EEG → text decoding')
    decode_p.add_argument('--mode', default='flood', choices=['flood', 'drift', 'focus', 'layered'])
    decode_p.add_argument('--eeg', '-f', required=True)
    decode_p.add_argument('--autoencoder', '-a', required=True)
    decode_p.add_argument('--semantic', '-s', required=True)
    decode_p.add_argument('--db', default='morphism.db')
    decode_p.add_argument('--index', default='morphism')
    decode_p.add_argument('--window-size', type=int, default=624)
    decode_p.add_argument('--stride', type=int, default=32)
    decode_p.add_argument('--batch-size', type=int, default=32)
    decode_p.add_argument('--device', default=None)
    decode_p.add_argument('--search-k', type=int, default=1024)
    decode_p.add_argument('--final-k', type=int, default=1024)
    decode_p.add_argument('--last-n', type=int, default=128)
    decode_p.add_argument('--raw-eeg', action='store_true')
    decode_p.add_argument('--input-dim', type=int, default=None)
    decode_p.add_argument('--save-vectors', action='store_true')
    decode_p.add_argument('--vector-output', default='semantic_vectors.npz')

    args = parser.parse_args()

    # No subcommand given: show usage and exit cleanly.
    if args.command is None:
        parser.print_help()
        sys.exit(0)

    # create/rebuild need a corpus directory; validate before dispatching.
    if args.command == 'index':
        if args.action in ('create', 'rebuild') and not args.corpus:
            print("--corpus is required for create/rebuild")
            sys.exit(1)

    handlers = {'record': cmd_record, 'index': cmd_index, 'decode': cmd_decode}
    handlers[args.command](args)
+
434
+
435
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()