Feat - Block like transformer structure

Files changed (10) hide show

.gitignore +2 -0
README.md +389 -377
blocks.py +446 -0
config.json +2 -0
configuration_autoencoder.py +122 -21
model.safetensors +0 -0
modeling_autoencoder.py +122 -748
preprocessing.py +457 -0
template.py +382 -0
utils.py +69 -0

.gitignore CHANGED Viewed

@@ -8,3 +8,5 @@ wheels/
 # Virtual environments
 .venv

 # Virtual environments
 .venv
+tests/*

README.md CHANGED Viewed

@@ -12,491 +12,503 @@ tags:
   - scaler
 ---
-# Autoencoder Implementation for Hugging Face Transformers
-A complete autoencoder implementation that integrates seamlessly with the Hugging Face Transformers ecosystem, providing all the standard functionality you expect from transformer models.
-### Install-and-Use from the Hub (code repo)
-If you want to use the implementation directly from the Hub code repository (without a packaged pip install), you can download the repo and add it to `sys.path`:
 ```python
 from huggingface_hub import snapshot_download
 import sys, torch
-# 1) Download the code+weights for your repo “as is”
 repo_dir = snapshot_download(
     repo_id="amaye15/autoencoder",
     repo_type="model",
-    allow_patterns=["*.py", "config.json", "*.safetensors"],  # note the * wildcards
 )
-# 2) Add to import path so plain imports work
 sys.path.append(repo_dir)
-# 3) Import your classes from the repo code
-from configuration_autoencoder import AutoencoderConfig
 from modeling_autoencoder import AutoencoderForReconstruction
-# 4) Load the placeholder weights from the local folder (no internet, no code refresh)
 model = AutoencoderForReconstruction.from_pretrained(repo_dir)
-# 5) Quick smoke test
 x = torch.randn(8, 20)
 out = model(input_values=x)
 print("latent:", out.last_hidden_state.shape, "reconstructed:", out.reconstructed.shape)
 ```
-## 🚀 Features
-- **Full Hugging Face Integration**: Compatible with `AutoModel`, `AutoConfig`, and `AutoTokenizer` patterns
-- **Standard Training Workflows**: Works with `Trainer`, `TrainingArguments`, and all HF training utilities
-- **Model Hub Compatible**: Save and share models on Hugging Face Hub with `push_to_hub()`
-- **Flexible Architecture**: Configurable encoder-decoder architecture with various activation functions
-- **Multiple Loss Functions**: Support for MSE, BCE, L1, Huber, Smooth L1, KL Divergence, Cosine, Focal, Dice, Tversky, SSIM, and Perceptual loss
-- **Multiple Autoencoder Types (7)**: Classic, Variational (VAE), Beta-VAE, Denoising, Sparse, Contractive, and Recurrent autoencoders
-- **Extended Activation Functions**: 18+ activation functions including ReLU, GELU, Swish, Mish, ELU, and more
-- **Learnable Preprocessing**: Neural Scaler, Normalizing Flow, MinMax Scaler (learnable), Robust Scaler (learnable), and Yeo-Johnson preprocessors (2D and 3D tensors)
-- **Extensible Design**: Easy to extend for new autoencoder variants and custom loss functions
-- **Production Ready**: Proper serialization, checkpointing, and inference support
-## 🏗️ Architecture
-The implementation consists of three main components:
-### 1. AutoencoderConfig
-Configuration class that inherits from `PretrainedConfig`:
-- Defines model architecture parameters
-- Handles validation and serialization
-- Enables `AutoConfig.from_pretrained()` functionality
-### 2. AutoencoderModel
-Base model class that inherits from `PreTrainedModel`:
-- Implements encoder-decoder architecture
-- Provides latent space representation
-- Returns structured outputs with `AutoencoderOutput`
-### 3. AutoencoderForReconstruction
-Task-specific model for reconstruction:
-- Adds reconstruction loss calculation
-- Compatible with `Trainer` for easy training
-- Returns `AutoencoderForReconstructionOutput` with loss
-## 🔧 Quick Start
-### Basic Usage
 ```python
-from configuration_autoencoder import AutoencoderConfig
-from modeling_autoencoder import AutoencoderForReconstruction
-import torch
-# Create configuration
-config = AutoencoderConfig(
-    input_dim=784,              # Input dimensionality (e.g., 28x28 images flattened)
-    hidden_dims=[512, 256],     # Encoder hidden layers
-    latent_dim=64,              # Latent space dimension
-    activation="gelu",          # Activation function (18+ options available)
-    reconstruction_loss="mse",  # Loss function (12+ options available)
-    autoencoder_type="classic", # Autoencoder type (7 types available)
-    # Optional learnable preprocessing
-    use_learnable_preprocessing=True,
-    preprocessing_type="neural_scaler",  # or "normalizing_flow", "minmax_scaler", "robust_scaler", "yeo_johnson"
-)
-# Create model
-model = AutoencoderForReconstruction(config)
-# Forward pass
-input_data = torch.randn(32, 784)  # Batch of 32 samples
-outputs = model(input_values=input_data)
-print(f"Reconstruction loss: {outputs.loss}")
-print(f"Latent shape: {outputs.last_hidden_state.shape}")
-print(f"Reconstructed shape: {outputs.reconstructed.shape}")
 ```
-### Training with Hugging Face Trainer
 ```python
-from transformers import Trainer, TrainingArguments
-from torch.utils.data import Dataset
-class AutoencoderDataset(Dataset):
-    def __init__(self, data):
-        self.data = torch.FloatTensor(data)
-    def __len__(self):
-        return len(self.data)
-    def __getitem__(self, idx):
-        return {
-            "input_values": self.data[idx],
-            "labels": self.data[idx]  # For autoencoder, input = target
-        }
-# Prepare data
-train_dataset = AutoencoderDataset(your_training_data)
-val_dataset = AutoencoderDataset(your_validation_data)
-# Training arguments
-training_args = TrainingArguments(
-    output_dir="./autoencoder_output",
-    num_train_epochs=10,
-    per_device_train_batch_size=64,
-    per_device_eval_batch_size=64,
-    warmup_steps=500,
-    weight_decay=0.01,
-    logging_dir="./logs",
-    evaluation_strategy="steps",
-    eval_steps=500,
-    save_steps=1000,
-    load_best_model_at_end=True,
-)
-# Create trainer
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=train_dataset,
-    eval_dataset=val_dataset,
-)
-# Train
-trainer.train()
-# Save model
-model.save_pretrained("./my_autoencoder")
-config.save_pretrained("./my_autoencoder")
 ```
-### Using AutoModel Framework
 ```python
-from register_autoencoder import register_autoencoder_models
-from transformers import AutoConfig, AutoModel
-# Register models with AutoModel framework
-register_autoencoder_models()
-# Now you can use standard HF patterns
-config = AutoConfig.from_pretrained("./my_autoencoder")
-model = AutoModel.from_pretrained("./my_autoencoder")
-# Use the model
-outputs = model(input_values=your_data)
 ```
-## ⚙️ Configuration Options
-The `AutoencoderConfig` class supports extensive customization:
 ```python
-config = AutoencoderConfig(
-    input_dim=784,                    # Input dimension
-    hidden_dims=[512, 256, 128],      # Encoder hidden layers
-    latent_dim=64,                    # Latent space dimension
-    activation="gelu",                # Activation function (see full list below)
-    dropout_rate=0.1,                 # Dropout rate (0.0 to 1.0)
-    use_batch_norm=True,              # Use batch normalization
-    tie_weights=False,                # Tie encoder/decoder weights
-    reconstruction_loss="mse",        # Loss function (see full list below)
-    autoencoder_type="variational",   # Autoencoder type (see types below)
-    beta=0.5,                         # Beta parameter for β-VAE
-    temperature=1.0,                  # Temperature for Gumbel softmax
-    noise_factor=0.1,                 # Noise factor for denoising AE
-    # Recurrent autoencoder parameters
-    rnn_type="lstm",                  # RNN type: "lstm", "gru", "rnn"
-    num_layers=2,                     # Number of RNN layers
-    bidirectional=True,               # Bidirectional encoding
-    sequence_length=None,             # Fixed sequence length (None for variable)
-    teacher_forcing_ratio=0.5,        # Teacher forcing ratio during training
-    # Learnable preprocessing parameters
-    use_learnable_preprocessing=False, # Enable learnable preprocessing
-    preprocessing_type="none",        # "none", "neural_scaler", "normalizing_flow"
-    preprocessing_hidden_dim=64,      # Hidden dimension for preprocessing networks
-    preprocessing_num_layers=2,       # Number of layers in preprocessing networks
-    learn_inverse_preprocessing=True, # Learn inverse transformation
-    flow_coupling_layers=4,           # Number of coupling layers for flows
 )
 ```
-### 🎛️ Available Activation Functions
-**Standard Activations:**
-- `relu`, `leaky_relu`, `relu6`, `elu`, `prelu`
-- `tanh`, `sigmoid`, `hardsigmoid`, `hardtanh`
-- `gelu`, `swish`, `silu`, `hardswish`
-- `mish`, `softplus`, `softsign`, `tanhshrink`, `threshold`
-### 📊 Available Loss Functions
-**Regression Losses:**
-- `mse` - Mean Squared Error
-- `l1` - L1/MAE Loss
-- `huber` - Huber Loss
-- `smooth_l1` - Smooth L1 Loss
-**Classification/Probability Losses:**
-- `bce` - Binary Cross Entropy
-- `kl_div` - KL Divergence
-- `focal` - Focal Loss
-**Similarity Losses:**
-- `cosine` - Cosine Similarity Loss
-- `ssim` - Structural Similarity Loss
-- `perceptual` - Perceptual Loss
-**Segmentation Losses:**
-- `dice` - Dice Loss
-- `tversky` - Tversky Loss
-### 🏗️ Available Autoencoder Types
-**Classic Autoencoder (`classic`)**
-- Standard encoder-decoder architecture
-- Direct reconstruction loss minimization
-**Variational Autoencoder (`variational`)**
-- Probabilistic latent space with mean and variance
-- KL divergence regularization
-- Reparameterization trick for sampling
-**Beta-VAE (`beta_vae`)**
-- Variational autoencoder with adjustable β parameter
-- Better disentanglement of latent factors
-**Denoising Autoencoder (`denoising`)**
-- Adds noise to input during training
-- Learns robust representations
-- Configurable noise factor
-**Sparse Autoencoder (`sparse`)**
-- Encourages sparse latent representations
-- L1 regularization on latent activations
-- Useful for feature selection
-**Contractive Autoencoder (`contractive`)**
-- Penalizes large gradients of latent w.r.t. input
-- Learns smooth manifold representations
-- Robust to small input perturbations
-**Recurrent Autoencoder (`recurrent`)**
-- LSTM/GRU/RNN encoder-decoder architecture
-- Bidirectional encoding for better sequence representations
-- Variable length sequence support with padding
-- Teacher forcing during training for stable learning
-- Sequence-to-sequence reconstruction
 ```
-## 📊 Model Outputs
-### AutoencoderOutput
-The base model `AutoencoderModel` returns the following output:
-```
-```python
-@dataclass
-class AutoencoderOutput(ModelOutput):
-    last_hidden_state: torch.FloatTensor = None    # Latent representation
-    reconstructed: torch.FloatTensor = None        # Reconstructed input
-    hidden_states: Tuple[torch.FloatTensor] = None # Intermediate states
-    attentions: Tuple[torch.FloatTensor] = None    # Not used
-```
-### AutoencoderForReconstructionOutput
 ```python
-@dataclass
-class AutoencoderForReconstructionOutput(ModelOutput):
-    loss: torch.FloatTensor = None                 # Reconstruction loss
-    reconstructed: torch.FloatTensor = None        # Reconstructed input
-    last_hidden_state: torch.FloatTensor = None    # Latent representation
-    hidden_states: Tuple[torch.FloatTensor] = None # Intermediate states
 ```
-## 🔬 Advanced Usage
-### Custom Loss Functions
-You can easily extend the model with custom loss functions:
 ```python
-class CustomAutoencoder(AutoencoderForReconstruction):
-    def _compute_reconstruction_loss(self, reconstructed, target):
-        # Custom loss implementation
-        return your_custom_loss(reconstructed, target)
 ```
-### Recurrent Autoencoder for Sequences
-Perfect for time series, text, and sequential data:
 ```python
-config = AutoencoderConfig(
-    input_dim=50,              # Feature dimension per timestep
-    latent_dim=32,             # Compressed representation size
-    autoencoder_type="recurrent",
-    rnn_type="lstm",           # or "gru", "rnn"
-    num_layers=2,              # Number of RNN layers
-    bidirectional=True,        # Bidirectional encoding
-    teacher_forcing_ratio=0.7, # Teacher forcing during training
-    sequence_length=None       # Variable length sequences
-)
-# Usage with sequence data
-model = AutoencoderForReconstruction(config)
-sequence_data = torch.randn(batch_size, seq_len, input_dim)
-outputs = model(input_values=sequence_data)
 ```
-### Learnable Preprocessing
-Deep learning-based data normalization that adapts to your data:
 ```python
-# Neural Scaler - Learnable alternative to StandardScaler
-config = AutoencoderConfig(
-    input_dim=20,
-    latent_dim=10,
-    use_learnable_preprocessing=True,
-    preprocessing_type="neural_scaler",
-    preprocessing_hidden_dim=64
-)
-# Normalizing Flow - Invertible transformations
-config = AutoencoderConfig(
-    input_dim=20,
-    latent_dim=10,
-    use_learnable_preprocessing=True,
-    preprocessing_type="normalizing_flow",
-    flow_coupling_layers=4
-)
-# Works with all autoencoder types and sequence data
-model = AutoencoderForReconstruction(config)
-outputs = model(input_values=data)
-print(f"Preprocessing loss: {outputs.preprocessing_loss}")
 ```
 ```python
-# Learnable MinMax Scaler - scales to [0, 1] with learnable bounds
-config = AutoencoderConfig(
-    input_dim=20,
-    latent_dim=10,
-    use_learnable_preprocessing=True,
-    preprocessing_type="minmax_scaler",
-)
-# Learnable Robust Scaler - robust to outliers using median/IQR
-config = AutoencoderConfig(
-    input_dim=20,
-    latent_dim=10,
-    use_learnable_preprocessing=True,
-    preprocessing_type="robust_scaler",
-)
-# Learnable Yeo-Johnson - power transform for skewed distributions
-config = AutoencoderConfig(
-    input_dim=20,
-    latent_dim=10,
-    use_learnable_preprocessing=True,
-    preprocessing_type="yeo_johnson",
-)
 ```
-### Variational Autoencoder Extension
-The configuration supports variational autoencoders:
 ```python
-config = AutoencoderConfig(
-    autoencoder_type="variational",
-    beta=0.5,  # β-VAE parameter
-    # ... other parameters
-)
-```
-### Integration with Datasets Library
-```python
-from datasets import Dataset
-# Convert your data to HF Dataset
-dataset = Dataset.from_dict({
-    "input_values": your_data_list
-})
-# Use with Trainer
 trainer = Trainer(
     model=model,
-    train_dataset=dataset,
-    # ... other arguments
 )
-```
-## 📁 Project Structure
-```
-autoencoder/
-├── __init__.py                    # Package initialization
-├── configuration_autoencoder.py   # Configuration class
-├── modeling_autoencoder.py        # Model implementations
-├── register_autoencoder.py        # AutoModel registration
-├── pyproject.toml                 # Project metadata and dependencies
-└── README.md                      # This file
 ```
-## 🤝 Contributing
-This implementation follows Hugging Face conventions and can be easily extended:
-1. **Adding new architectures**: Extend `AutoencoderModel` or create new model classes
-2. **Custom configurations**: Add parameters to `AutoencoderConfig`
-3. **Task-specific heads**: Create new classes like `AutoencoderForReconstruction`
-4. **Integration**: Register new models with the AutoModel framework
-## 📚 References
-- [Hugging Face Transformers Documentation](https://huggingface.co/docs/transformers)
-- [Custom Models Guide](https://huggingface.co/docs/transformers/custom_models)
-- [AutoModel Documentation](https://huggingface.co/docs/transformers/model_doc/auto)
-## 🎯 Use Cases
-This autoencoder implementation is perfect for:
-- **Dimensionality Reduction**: Compress high-dimensional data to lower dimensions
-- **Anomaly Detection**: Identify outliers based on reconstruction error
-- **Data Denoising**: Remove noise from corrupted data
-- **Feature Learning**: Learn meaningful representations for downstream tasks
-- **Data Generation**: Generate new samples similar to training data
-- **Pretraining**: Initialize encoders for other tasks
-## 🔍 Model Comparison
-| Feature | Standard PyTorch | This Implementation |
-|---------|------------------|-------------------|
-| HF Integration | ❌ | ✅ |
-| AutoModel Support | ❌ | ✅ |
-| Trainer Compatible | ❌ | ✅ |
-| Hub Integration | ❌ | ✅ |
-| Config Management | Manual | ✅ Automatic |
-| Serialization | Manual | ✅ Built-in |
-| Checkpointing | Manual | ✅ Built-in |
-## 🚀 Performance Tips
-1. **Batch Size**: Use larger batch sizes for better GPU utilization
-2. **Learning Rate**: Start with 1e-3 and adjust based on convergence
-3. **Architecture**: Gradually decrease hidden dimensions for better compression
-4. **Regularization**: Use dropout and batch normalization for better generalization
-5. **Loss Function**: Choose appropriate loss based on your data type
-## 📄 License
-This implementation is provided as an example and follows the same license terms as Hugging Face Transformers.

   - scaler
 ---
+## Autoencoder for Hugging Face Transformers (Block-based)
+A flexible, production-grade Autoencoder implementation built to fit naturally into the Transformers ecosystem. It supports a new block-based architecture with ready-to-use templates for classic MLP, VAE/beta-VAE, Transformer, Recurrent, Convolutional, mixed hybrids, and learnable preprocessing.
+### Key features
+- Block-based architecture: Linear, Attention, Recurrent (LSTM/GRU/RNN), Convolutional, Variational blocks
+- Class-based configuration presets in template.py for quick starts
+- Variational and beta-VAE variants (KL-controlled)
+- Learnable preprocessing and inverse transforms
+- Hugging Face-compatible config/model API and from_pretrained/save_pretrained
+## Install and load from the Hub (code repo)
 ```python
 from huggingface_hub import snapshot_download
 import sys, torch
 repo_dir = snapshot_download(
     repo_id="amaye15/autoencoder",
     repo_type="model",
+    allow_patterns=["*.py", "config.json", "*.safetensors"],
 )
 sys.path.append(repo_dir)
 from modeling_autoencoder import AutoencoderForReconstruction
 model = AutoencoderForReconstruction.from_pretrained(repo_dir)
 x = torch.randn(8, 20)
 out = model(input_values=x)
 print("latent:", out.last_hidden_state.shape, "reconstructed:", out.reconstructed.shape)
 ```
+## Quickstart with class-based templates
+```python
+from modeling_autoencoder import AutoencoderModel
+from template import ClassicAutoencoderConfig
+cfg = ClassicAutoencoderConfig(input_dim=784, latent_dim=64)
+model = AutoencoderModel(cfg)
+x = torch.randn(4, 784)
+out = model(x, return_dict=True)
+print(out.last_hidden_state.shape, out.reconstructed.shape)
+```
+### Available presets (template.py)
+- ClassicAutoencoderConfig: Dense MLP AE
+- VariationalAutoencoderConfig: VAE with KL regularization
+- BetaVariationalAutoencoderConfig: beta-VAE (beta > 1)
+- TransformerAutoencoderConfig: Attention-based encoder for sequences
+- RecurrentAutoencoderConfig: LSTM/GRU encoder for sequences
+- ConvolutionalAutoencoderConfig: 1D Conv encoder for sequences
+- ConvAttentionAutoencoderConfig: Mixed Conv + Attention encoder
+- LinearRecurrentAutoencoderConfig: Linear down-projection + RNN
+- PreprocessedAutoencoderConfig: MLP AE with learnable preprocessing
+## Block-based architecture
+The autoencoder uses a modular block system where you define encoder_blocks and decoder_blocks as lists of dictionaries. Each block dict specifies its type and parameters.
+### Available block types
+#### LinearBlock
+Dense layer with optional normalization, activation, dropout, and residual connections.
 ```python
+{
+    "type": "linear",
+    "input_dim": 256,
+    "output_dim": 128,
+    "activation": "relu",           # relu, gelu, tanh, sigmoid, etc.
+    "normalization": "batch",       # batch, layer, group, instance, none
+    "dropout_rate": 0.1,
+    "use_residual": False,          # adds skip connection if input_dim == output_dim
+    "residual_scale": 1.0
+}
 ```
+#### AttentionBlock
+Multi-head self-attention with feed-forward network. Works with 2D (B, D) or 3D (B, T, D) inputs.
 ```python
+{
+    "type": "attention",
+    "input_dim": 128,
+    "num_heads": 8,
+    "ffn_dim": 512,                 # if None, defaults to 4 * input_dim
+    "dropout_rate": 0.1
+}
+```
+#### RecurrentBlock
+LSTM, GRU, or vanilla RNN encoder. Outputs final hidden state or all timesteps.
+```python
+{
+    "type": "recurrent",
+    "input_dim": 64,
+    "hidden_size": 128,
+    "num_layers": 2,
+    "rnn_type": "lstm",             # lstm, gru, rnn
+    "bidirectional": True,
+    "dropout_rate": 0.1,
+    "output_dim": 128               # final output dimension
+}
 ```
+#### ConvolutionalBlock
+1D convolution for sequence data. Expects 3D input (B, T, D).
 ```python
+{
+    "type": "conv1d",
+    "input_dim": 64,                # input channels
+    "output_dim": 128,              # output channels
+    "kernel_size": 3,
+    "padding": "same",              # "same" or integer
+    "activation": "relu",
+    "normalization": "batch",
+    "dropout_rate": 0.1
+}
+```
+#### VariationalBlock
+Produces mu and logvar for VAE reparameterization. Used internally by the model when autoencoder_type="variational".
+```python
+{
+    "type": "variational",
+    "input_dim": 128,
+    "latent_dim": 64
+}
 ```
+### Custom configuration examples
+#### Mixed architecture (Conv + Attention + Linear)
 ```python
+from configuration_autoencoder import AutoencoderConfig
+enc = [
+    # 1D convolution for local patterns
+    {"type": "conv1d", "input_dim": 64, "output_dim": 128, "kernel_size": 3, "padding": "same", "activation": "relu"},
+    {"type": "conv1d", "input_dim": 128, "output_dim": 128, "kernel_size": 3, "padding": "same", "activation": "relu"},
+    # Self-attention for global dependencies
+    {"type": "attention", "input_dim": 128, "num_heads": 8, "ffn_dim": 512, "dropout_rate": 0.1},
+    # Final linear projection
+    {"type": "linear", "input_dim": 128, "output_dim": 64, "activation": "relu", "normalization": "batch"}
+]
+dec = [
+    {"type": "linear", "input_dim": 32, "output_dim": 64, "activation": "relu", "normalization": "batch"},
+    {"type": "linear", "input_dim": 64, "output_dim": 128, "activation": "relu", "normalization": "batch"},
+    {"type": "linear", "input_dim": 128, "output_dim": 64, "activation": "identity", "normalization": "none"}
+]
+cfg = AutoencoderConfig(
+    input_dim=64,
+    latent_dim=32,
+    autoencoder_type="classic",
+    encoder_blocks=enc,
+    decoder_blocks=dec
 )
 ```
+#### Hierarchical encoder (multiple scales)
+```python
+enc = [
+    # Local features
+    {"type": "linear", "input_dim": 784, "output_dim": 512, "activation": "relu", "normalization": "batch"},
+    {"type": "linear", "input_dim": 512, "output_dim": 256, "activation": "relu", "normalization": "batch"},
+    # Mid-level features with residual
+    {"type": "linear", "input_dim": 256, "output_dim": 256, "activation": "relu", "normalization": "batch", "use_residual": True},
+    {"type": "linear", "input_dim": 256, "output_dim": 256, "activation": "relu", "normalization": "batch", "use_residual": True},
+    # High-level features
+    {"type": "linear", "input_dim": 256, "output_dim": 128, "activation": "relu", "normalization": "batch"},
+    {"type": "linear", "input_dim": 128, "output_dim": 64, "activation": "relu", "normalization": "batch"}
+]
 ```
+#### Sequence-to-sequence with recurrent encoder
+```python
+enc = [
+    {"type": "recurrent", "input_dim": 100, "hidden_size": 128, "num_layers": 2, "rnn_type": "lstm", "bidirectional": True, "output_dim": 256},
+    {"type": "linear", "input_dim": 256, "output_dim": 128, "activation": "tanh", "normalization": "layer"}
+]
+dec = [
+    {"type": "linear", "input_dim": 64, "output_dim": 128, "activation": "tanh", "normalization": "layer"},
+    {"type": "linear", "input_dim": 128, "output_dim": 100, "activation": "identity", "normalization": "none"}
+]
+```
+### Input shape handling
+- **2D inputs (B, D)**: Work with Linear blocks directly. Attention/Recurrent/Conv blocks treat them as (B, 1, D)
+- **3D inputs (B, T, D)**: Work with all block types. Linear blocks operate per-timestep
+- **Output shapes**: The decoder typically outputs the same shape as the input. For sequence models, the final shape depends on the decoder architecture
+## Configuration (configuration_autoencoder.py)
+AutoencoderConfig is the core configuration class. Important fields:
+- input_dim: feature dimension (D)
+- latent_dim: latent size
+- encoder_blocks, decoder_blocks: block lists (see block types above)
+- activation, dropout_rate, use_batch_norm: defaults used by some presets
+- autoencoder_type: classic | variational | beta_vae | denoising | sparse | contractive | recurrent
+- reconstruction_loss: mse | bce | l1 | huber | smooth_l1 | kl_div | cosine | focal | dice | tversky | ssim | perceptual
+- Preprocessing: use_learnable_preprocessing, preprocessing_type, learn_inverse_preprocessing
+Example:
 ```python
+from configuration_autoencoder import AutoencoderConfig
+cfg = AutoencoderConfig(
+    input_dim=128,
+    latent_dim=32,
+    autoencoder_type="variational",
+    encoder_blocks=[{"type": "linear", "input_dim": 128, "output_dim": 64, "activation": "relu"}],
+    decoder_blocks=[{"type": "linear", "input_dim": 32, "output_dim": 128, "activation": "identity", "normalization": "none"}],
+)
 ```
+## Models (modeling_autoencoder.py)
+Main classes:
+- AutoencoderModel: core module exposing forward that returns last_hidden_state (latent) and reconstructed
+- AutoencoderForReconstruction: HF-compatible model wrapper with from_pretrained/save_pretrained
+Forward usage:
+```python
+from modeling_autoencoder import AutoencoderModel
+x = torch.randn(8, 20)
+out = model(x, return_dict=True)
+print(out.last_hidden_state.shape, out.reconstructed.shape)
+```
+### Variational behavior
+If cfg.autoencoder_type is either "variational" or "beta_vae":
+- The model uses an internal VariationalBlock to compute mu and logvar
+- Samples z during training; uses mu during eval
+- The KL term can be computed from model._mu and model._logvar (also exposed in hidden_states when output_hidden_states=True)
 ```python
+out = model(x, return_dict=True, output_hidden_states=True)
+latent, mu, logvar = out.hidden_states
 ```
+## Preprocessing (preprocessing.py)
+- PreprocessingBlock wraps LearnablePreprocessor and can be placed before/after the core encoder/decoder
+- When enabled via config.use_learnable_preprocessing, the model constructs two blocks: pre (forward) and post (inverse)
+- The block tracks reg_loss, which is added to preprocessing_loss in the model output
 ```python
+from template import PreprocessedAutoencoderConfig
+cfg = PreprocessedAutoencoderConfig(input_dim=64, latent_dim=32, preprocessing_type="neural_scaler")
+model = AutoencoderModel(cfg)
 ```
+## Utilities (utils.py)
+Common helpers:
+- _get_activation(name)
+- _get_norm(name, num_groups=None)
+- _flatten_3d_to_2d(x), _maybe_restore_3d(x, ref)
+## Training examples
+### Basic MSE reconstruction
 ```python
+from modeling_autoencoder import AutoencoderModel
+from template import ClassicAutoencoderConfig
+cfg = ClassicAutoencoderConfig(input_dim=784, latent_dim=64)
+model = AutoencoderModel(cfg)
+opt = torch.optim.Adam(model.parameters(), lr=1e-3)
+for x in dataloader:  # x: (B, 784)
+    out = model(x, return_dict=True)
+    loss = torch.nn.functional.mse_loss(out.reconstructed, x)
+    loss.backward(); opt.step(); opt.zero_grad()
 ```
+### VAE with KL term
 ```python
+from template import VariationalAutoencoderConfig
+cfg = VariationalAutoencoderConfig(input_dim=784, latent_dim=32)
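+model = AutoencoderModel(cfg)
+opt = torch.optim.Adam(model.parameters(), lr=1e-3)  # new optimizer bound to this model's parameters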
+for x in dataloader:
+    out = model(x, return_dict=True, output_hidden_states=True)
+    recon = torch.nn.functional.mse_loss(out.reconstructed, x)
+    _, mu, logvar = out.hidden_states
+    kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
+    loss = recon + cfg.beta * kl
+    loss.backward(); opt.step(); opt.zero_grad()
+```
+### Sequence reconstruction (Conv + Attention)
+```python
+from template import ConvAttentionAutoencoderConfig
+cfg = ConvAttentionAutoencoderConfig(input_dim=64, latent_dim=64)
+model = AutoencoderModel(cfg)
+x = torch.randn(8, 50, 64)  # (B, T, D)
+out = model(x, return_dict=True)
 ```
+## End-to-end saving/loading
+```python
+from modeling_autoencoder import AutoencoderForReconstruction
+model.save_pretrained("./my_ae")
+reloaded = AutoencoderForReconstruction.from_pretrained("./my_ae")
+```
+## Troubleshooting
+- Check that block input_dim/output_dim align across adjacent blocks
+- For attention/recurrent/conv blocks, prefer 3D inputs (B, T, D). 2D inputs are coerced to (B, 1, D)
+- For variational/beta-VAE, ensure latent_dim is set; KL term available via hidden states
+- When preprocessing is enabled, preprocessing_loss is included in the output for logging/regularization
+## Full AutoencoderConfig reference
+Below is a comprehensive reference for all fields in configuration_autoencoder.AutoencoderConfig. Some fields are primarily used by presets or advanced features but are documented here for completeness.
+- input_dim (int, default=784): Input feature dimension D. For sequences, D is per-timestep feature size.
+- hidden_dims (List[int], default=[512,256,128]): Legacy convenience list for simple MLPs. Prefer encoder_blocks.
+- encoder_blocks (List[dict] | None): Block list for encoder. See Block-based architecture for block schemas.
+- decoder_blocks (List[dict] | None): Block list for decoder. If omitted, model may derive a simple decoder from hidden_dims.
+- latent_dim (int, default=64): Latent space dimension.
+- activation (str, default="relu"): Default activation for Linear blocks when using legacy paths or presets.
+- dropout_rate (float, default=0.1): Default dropout used in presets and some layers.
+- use_batch_norm (bool, default=True): Default normalization flag used in presets ("batch" if True, else "none").
+- tie_weights (bool, default=False): If True, tie (share) encoder and decoder weights; this may have no effect for some architectures.
+- reconstruction_loss (str, default="mse"): Which loss to use in AutoencoderForReconstruction. One of:
+  - "mse", "bce", "l1", "huber", "smooth_l1", "kl_div", "cosine", "focal", "dice", "tversky", "ssim", "perceptual".
+- autoencoder_type (str, default="classic"): Architecture variant. One of:
+  - "classic", "variational", "beta_vae", "denoising", "sparse", "contractive", "recurrent".
+- beta (float, default=1.0): KL weight for VAE/beta-VAE.
+- temperature (float, default=1.0): Reserved for temperature-based operations.
+- noise_factor (float, default=0.1): Denoising strength used by Denoising variants.
+- rnn_type (str, default="lstm"): For recurrent variants. One of: "lstm", "gru", "rnn".
+- num_layers (int, default=2): Number of RNN layers for recurrent variants.
+- bidirectional (bool, default=True): Whether RNN is bidirectional in recurrent variants.
+- sequence_length (int | None, default=None): Optional fixed sequence length; if None, variable length is supported.
+- teacher_forcing_ratio (float, default=0.5): For recurrent decoders that use teacher forcing.
+- use_learnable_preprocessing (bool, default=False): Enable learnable preprocessing.
+- preprocessing_type (str, default="none"): One of: "none", "neural_scaler", "normalizing_flow", "minmax_scaler", "robust_scaler", "yeo_johnson".
+- preprocessing_hidden_dim (int, default=64): Hidden size for preprocessing networks.
+- preprocessing_num_layers (int, default=2): Number of layers for preprocessing networks.
+- learn_inverse_preprocessing (bool, default=True): Whether to learn inverse transform for reconstruction.
+- flow_coupling_layers (int, default=4): Number of coupling layers for normalizing flows.
+Derived helpers and flags:
+- has_block_lists: True if either encoder_blocks or decoder_blocks is provided.
+- is_variational: True if autoencoder_type in {"variational", "beta_vae"}.
+- is_denoising, is_sparse, is_contractive, is_recurrent: Variant flags.
+- has_preprocessing: True if preprocessing enabled and type != "none".
+Validation notes:
+- activation must be one of the supported values listed in configuration_autoencoder.py
+- reconstruction_loss must be one of the supported values listed above
+- Many numeric parameters are validated to be positive or within [0,1]
+## Training with Hugging Face Trainer
+The AutoencoderForReconstruction model computes reconstruction loss internally using config.reconstruction_loss. For VAEs/beta-VAEs, it adds the KL term scaled by config.beta. You can plug it directly into transformers.Trainer.
 ```python
+from transformers import Trainer, TrainingArguments
+from modeling_autoencoder import AutoencoderForReconstruction
+from template import ClassicAutoencoderConfig
+import torch
+from torch.utils.data import Dataset
+# 1) Config and model
+cfg = ClassicAutoencoderConfig(input_dim=64, latent_dim=16)
+model = AutoencoderForReconstruction(cfg)
+# 2) Dummy dataset (replace with your own)
+class ToyAEDataset(Dataset):
+    def __init__(self, n=1024, d=64):
+        self.x = torch.randn(n, d)
+    def __len__(self):
+        return self.x.size(0)
+    def __getitem__(self, idx):
+        xi = self.x[idx]
+        return {"input_values": xi, "labels": xi}
+train_ds = ToyAEDataset()
+# 3) TrainingArguments
+args = TrainingArguments(
+    output_dir="./ae-trainer",
+    per_device_train_batch_size=64,
+    learning_rate=1e-3,
+    num_train_epochs=3,
+    logging_steps=50,
+    save_steps=200,
+    report_to=[],  # disable wandb if not configured
+)
+# 4) Trainer
 trainer = Trainer(
     model=model,
+    args=args,
+    train_dataset=train_ds,
 )
+# 5) Train
+trainer.train()
+# 6) Use the model
+x = torch.randn(4, 64)
+out = model(input_values=x, return_dict=True)
+print(out.last_hidden_state.shape, out.reconstructed.shape)
 ```
+Notes:
+- The dataset must yield dicts with "input_values" and optionally "labels"; if labels are missing, the model uses input as the target.
+- For sequence inputs, shape is (B, T, D). For simple vectors, (B, D).
+- Set cfg.reconstruction_loss to e.g. "bce" to switch the internal loss (the decoder head applies sigmoid when BCE is used).
+- For VAE/beta-VAE, use VariationalAutoencoderConfig/BetaVariationalAutoencoderConfig.
+### Example using AutoencoderConfig directly
+Below shows how to define a configuration purely with block dicts using AutoencoderConfig, without the template classes.
+```python
+from configuration_autoencoder import AutoencoderConfig
+from modeling_autoencoder import AutoencoderModel
+import torch
+# Encoder: Linear -> Attention -> Linear
+enc = [
+    {"type": "linear", "input_dim": 128, "output_dim": 128, "activation": "relu", "normalization": "batch", "dropout_rate": 0.1},
+    {"type": "attention", "input_dim": 128, "num_heads": 4, "ffn_dim": 512, "dropout_rate": 0.1},
+    {"type": "linear", "input_dim": 128, "output_dim": 64, "activation": "relu", "normalization": "batch"},
+]
+# Decoder: Linear -> Linear (final identity)
+dec = [
+    {"type": "linear", "input_dim": 32, "output_dim": 64, "activation": "relu", "normalization": "batch"},
+    {"type": "linear", "input_dim": 64, "output_dim": 128, "activation": "identity", "normalization": "none"},
+]
+cfg = AutoencoderConfig(
+    input_dim=128,
+    latent_dim=32,
+    encoder_blocks=enc,
+    decoder_blocks=dec,
+    autoencoder_type="classic",
+)
+model = AutoencoderModel(cfg)
+x = torch.randn(4, 128)
+out = model(x, return_dict=True)
+print(out.last_hidden_state.shape, out.reconstructed.shape)
+```
+For a variational model, set autoencoder_type="variational" and the model will internally use a VariationalBlock for mu/logvar and sampling.
+## Learnable preprocessing
+Enable learnable preprocessing and its inverse with the PreprocessedAutoencoderConfig class or via flags.
+```python
+from template import PreprocessedAutoencoderConfig
+cfg = PreprocessedAutoencoderConfig(input_dim=64, latent_dim=32, preprocessing_type="neural_scaler")
+```
+Supported preprocessing_type values include: "neural_scaler", "normalizing_flow", "minmax_scaler", "robust_scaler", "yeo_johnson".
+## Saving and loading
+```python
+from modeling_autoencoder import AutoencoderForReconstruction
+# Save
+model.save_pretrained("./my_ae")
+# Load
+reloaded = AutoencoderForReconstruction.from_pretrained("./my_ae")
+```
+## Reference
+Core modules:
+- configuration_autoencoder.AutoencoderConfig
+- modeling_autoencoder.AutoencoderModel, AutoencoderForReconstruction
+- blocks: BlockFactory, BlockSequence, Linear/Attention/Recurrent/Convolutional/Variational blocks
+- preprocessing: PreprocessingBlock (learnable preprocessing wrapper)
+- template: class-based presets listed above
+## License
+Apache-2.0 (see LICENSE)

blocks.py ADDED Viewed

	@@ -0,0 +1,446 @@

+"""
+Modular, block-based components for building autoencoders in PyTorch.
+Core goals:
+- Composable building blocks with consistent interfaces
+- Support 2D (B, F) and 3D (B, T, F) tensors where applicable
+- Simple configs to construct blocks and sequences
+- Safe-by-default validation and helpful errors
+This module is intentionally self-contained to allow gradual integration with
+existing models. It does not mutate current behavior.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# Import config dataclasses that define block configurations
+try:
+    from .configuration_autoencoder import (
+        BlockConfig,
+        LinearBlockConfig,
+        AttentionBlockConfig,
+        RecurrentBlockConfig,
+        ConvolutionalBlockConfig,
+        VariationalBlockConfig,
+    )
+except Exception:
+    from configuration_autoencoder import (
+        BlockConfig,
+        LinearBlockConfig,
+        AttentionBlockConfig,
+        RecurrentBlockConfig,
+        ConvolutionalBlockConfig,
+        VariationalBlockConfig,
+    )
+# Import shared utilities
+try:
+    from .utils import _get_activation, _get_norm, _flatten_3d_to_2d, _maybe_restore_3d
+except Exception:
+    from utils import _get_activation, _get_norm, _flatten_3d_to_2d, _maybe_restore_3d
+# ---------------------------- Base Block ---------------------------- #
+class BaseBlock(nn.Module):
+    """Abstract base for all blocks.
+    All blocks should accept 2D (B, F) or 3D (B, T, F) tensors and return the
+    same rank, with last-dim equal to `output_dim`.
+    """
+    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:  # pragma: no cover - abstract
+        raise NotImplementedError
+    @property
+    def output_dim(self) -> int:  # pragma: no cover - abstract
+        raise NotImplementedError
+# ---------------------------- Residual Base ---------------------------- #
+class ResidualBlock(BaseBlock):
+    """Base class for blocks supporting residual connections.
+    Implements a safe residual add when input and output dims match; otherwise
+    falls back to a learned projection. Residuals can be scaled.
+    """
+    def __init__(self, residual: bool = False, residual_scale: float = 1.0, proj_dim_in: Optional[int] = None, proj_dim_out: Optional[int] = None):
+        super().__init__()
+        self.use_residual = residual
+        self.residual_scale = residual_scale
+        self._proj: Optional[nn.Module] = None
+        if residual and proj_dim_in is not None and proj_dim_out is not None and proj_dim_in != proj_dim_out:
+            self._proj = nn.Linear(proj_dim_in, proj_dim_out)
+    def _apply_residual(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        if not self.use_residual:
+            return y
+        x2d, hint = _flatten_3d_to_2d(x)
+        y2d, _ = _flatten_3d_to_2d(y)
+        if x2d.shape[-1] != y2d.shape[-1]:
+            if self._proj is None:
+                self._proj = nn.Linear(x2d.shape[-1], y2d.shape[-1]).to(y2d.device)
+            x2d = self._proj(x2d)
+        out = x2d + self.residual_scale * y2d
+        return _maybe_restore_3d(out, hint)
+# ---------------------------- LinearBlock ---------------------------- #
+class LinearBlock(ResidualBlock):
+    """Basic linear transformation with normalization and activation.
+    - Handles both 2D (B, F) and 3D (B, T, F) tensors
+    - Optional normalization: batch|layer|group|instance|none
+    - Configurable activation
+    - Optional dropout
+    - Optional residual connection (with auto projection)
+    """
+    def __init__(self, cfg: LinearBlockConfig):
+        super().__init__(residual=cfg.use_residual, residual_scale=cfg.residual_scale, proj_dim_in=cfg.input_dim, proj_dim_out=cfg.output_dim)
+        self.cfg = cfg
+        self.linear = nn.Linear(cfg.input_dim, cfg.output_dim)
+        # Normalizations that expect N, C require 2D tensors; for 3D we flatten
+        # For LayerNorm, it supports last-dim directly
+        if cfg.normalization == "layer":
+            self.norm = nn.LayerNorm(cfg.output_dim)
+        else:
+            self.norm = _get_norm(cfg.normalization, cfg.output_dim)
+        self.act = _get_activation(cfg.activation)
+        self.drop = nn.Dropout(cfg.dropout_rate) if cfg.dropout_rate and cfg.dropout_rate > 0 else nn.Identity()
+    @property
+    def output_dim(self) -> int:
+        return self.cfg.output_dim
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_in = x
+        x2d, hint = _flatten_3d_to_2d(x)
+        y = self.linear(x2d)
+        # Apply norm safely
+        if isinstance(self.norm, (nn.BatchNorm1d, nn.InstanceNorm1d, nn.GroupNorm)):
+            y = self.norm(y)
+        else:
+            # LayerNorm or Identity operates on last dim and supports both 2D/3D; we already have 2D
+            y = self.norm(y)
+        y = self.act(y)
+        y = self.drop(y)
+        y = _maybe_restore_3d(y, hint)
+        return self._apply_residual(x_in, y)
+# ---------------------------- AttentionBlock ---------------------------- #
+class AttentionBlock(BaseBlock):
+    """Multi-head self-attention with optional FFN.
+    Expects inputs as 3D (B, T, D) or 2D (B, D) which will be treated as (B, 1, D).
+    Supports optional attn mask and key padding mask via kwargs.
+    """
+    def __init__(self, cfg: AttentionBlockConfig):
+        super().__init__()
+        self.cfg = cfg
+        d_model = cfg.input_dim
+        self.mha = nn.MultiheadAttention(d_model, num_heads=cfg.num_heads, dropout=cfg.dropout_rate, batch_first=True)
+        self.ln1 = nn.LayerNorm(d_model)
+        ffn_dim = cfg.ffn_dim or (4 * d_model)
+        self.ffn = nn.Sequential(
+            nn.Linear(d_model, ffn_dim),
+            _get_activation("gelu"),
+            nn.Dropout(cfg.dropout_rate),
+            nn.Linear(ffn_dim, d_model),
+        )
+        self.ln2 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(cfg.dropout_rate)
+    @property
+    def output_dim(self) -> int:
+        return self.cfg.input_dim
+    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, key_padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        if x.dim() == 2:
+            x = x.unsqueeze(1)
+            squeeze_back = True
+        else:
+            squeeze_back = False
+        # Self-attention
+        residual = x
+        attn_out, _ = self.mha(x, x, x, attn_mask=attn_mask, key_padding_mask=key_padding_mask, need_weights=False)
+        x = self.ln1(residual + self.dropout(attn_out))
+        # FFN
+        residual = x
+        x = self.ffn(x)
+        x = self.ln2(residual + self.dropout(x))
+        if squeeze_back:
+            x = x.squeeze(1)
+        return x
+# ---------------------------- RecurrentBlock ---------------------------- #
+class RecurrentBlock(BaseBlock):
+    """RNN processing block supporting LSTM/GRU/RNN.
+    Input: 3D (B, T, F) preferred. If 2D, treated as (B, 1, F).
+    Output dim equals cfg.output_dim if set; otherwise hidden_size * directions.
+    """
+    def __init__(self, cfg: RecurrentBlockConfig):
+        super().__init__()
+        self.cfg = cfg
+        rnn_type = cfg.rnn_type.lower()
+        rnn_cls = {"lstm": nn.LSTM, "gru": nn.GRU, "rnn": nn.RNN}.get(rnn_type)
+        if rnn_cls is None:
+            raise ValueError(f"Unknown rnn_type: {cfg.rnn_type}")
+        self.rnn = rnn_cls(
+            input_size=cfg.input_dim,
+            hidden_size=cfg.hidden_size,
+            num_layers=cfg.num_layers,
+            batch_first=True,
+            dropout=cfg.dropout_rate if cfg.num_layers > 1 else 0.0,
+            bidirectional=cfg.bidirectional,
+        )
+        out_dim = cfg.hidden_size * (2 if cfg.bidirectional else 1)
+        self._out_dim = cfg.output_dim or out_dim
+        self.proj = None if self._out_dim == out_dim else nn.Linear(out_dim, self._out_dim)
+    @property
+    def output_dim(self) -> int:
+        return self._out_dim
+    def forward(self, x: torch.Tensor, lengths: Optional[torch.Tensor] = None) -> torch.Tensor:
+        squeeze_back = False
+        if x.dim() == 2:
+            x = x.unsqueeze(1)
+            squeeze_back = True
+        if lengths is not None:
+            x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
+        if isinstance(self.rnn, nn.LSTM):
+            out, (h, c) = self.rnn(x)
+        else:
+            out, h = self.rnn(x)
+        if lengths is not None:
+            out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
+        # Use last timestep
+        y = out[:, -1, :]
+        if self.proj is not None:
+            y = self.proj(y)
+        if squeeze_back:
+            # Keep 2D output
+            return y
+        # Return (B, 1, D) to keep 3D shape consistent with sequences
+        return y.unsqueeze(1)
+# ---------------------------- ConvolutionalBlock ---------------------------- #
+class ConvolutionalBlock(BaseBlock):
+    """1D convolutional block for sequence-like data.
+    Accepts 3D (B, T, F) or 2D (B, F) which is treated as (B, 1, F).
+    """
+    def __init__(self, cfg: ConvolutionalBlockConfig):
+        super().__init__()
+        self.cfg = cfg
+        # Conv1d expects (B, C_in, L). We interpret features as channels and time as length.
+        # For inputs shaped (B, T, F): we transpose to (B, F, T), apply conv, transpose back.
+        padding = cfg.padding
+        if isinstance(padding, str) and padding == "same":
+            pad = cfg.kernel_size // 2
+        else:
+            pad = int(padding)
+        self.conv = nn.Conv1d(cfg.input_dim, cfg.output_dim, kernel_size=cfg.kernel_size, padding=pad)
+        # Norm: for Conv1d, use 1d norms over channels
+        if cfg.normalization == "layer":
+            self.norm = nn.GroupNorm(1, cfg.output_dim)  # Layer-like over channels
+        else:
+            self.norm = _get_norm(cfg.normalization, cfg.output_dim)
+        self.act = _get_activation(cfg.activation)
+        self.drop = nn.Dropout(cfg.dropout_rate) if cfg.dropout_rate and cfg.dropout_rate > 0 else nn.Identity()
+    @property
+    def output_dim(self) -> int:
+        return self.cfg.output_dim
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        squeeze_back = False
+        if x.dim() == 2:
+            x = x.unsqueeze(1)
+            squeeze_back = True
+        # x: (B, T, F) -> (B, F, T)
+        x = x.transpose(1, 2)
+        y = self.conv(x)
+        if isinstance(self.norm, (nn.BatchNorm1d, nn.InstanceNorm1d, nn.GroupNorm)):
+            y = self.norm(y)
+        y = self.act(y)
+        y = self.drop(y)
+        y = y.transpose(1, 2)
+        if squeeze_back:
+            y = y.squeeze(1)
+        return y
+# ---------------------------- VariationalBlock ---------------------------- #
+class VariationalBlock(BaseBlock):
+    """Encapsulates mu/logvar projection and reparameterization.
+    Input can be 2D (B, F) or 3D (B, T, F); for 3D, operates per timestep and returns same rank.
+    Stores mu/logvar on the module for downstream loss usage.
+    """
+    def __init__(self, cfg: VariationalBlockConfig):
+        super().__init__()
+        self.cfg = cfg
+        self.fc_mu = nn.Linear(cfg.input_dim, cfg.latent_dim)
+        self.fc_logvar = nn.Linear(cfg.input_dim, cfg.latent_dim)
+        self._mu: Optional[torch.Tensor] = None
+        self._logvar: Optional[torch.Tensor] = None
+    @property
+    def output_dim(self) -> int:
+        return self.cfg.latent_dim
+    def forward(self, x: torch.Tensor, training: Optional[bool] = None) -> torch.Tensor:
+        if training is None:
+            training = self.training
+        x2d, hint = _flatten_3d_to_2d(x)
+        mu = self.fc_mu(x2d)
+        logvar = self.fc_logvar(x2d)
+        if training:
+            std = torch.exp(0.5 * logvar)
+            eps = torch.randn_like(std)
+            z = mu + eps * std
+        else:
+            z = mu
+        self._mu = mu
+        self._logvar = logvar
+        z = _maybe_restore_3d(z, hint)
+        return z
+# ---------------------------- BlockSequence ---------------------------- #
+class BlockSequence(nn.Module):
+    """Compose multiple blocks into a validated sequence.
+    - Validates dimension flow between blocks
+    - Supports gradient checkpointing (per-block) via forward(checkpoint=True)
+    - Supports optional skip connections: pass `skips` as list of (src_idx, dst_idx)
+    """
+    def __init__(self, blocks: Sequence[BaseBlock], validate_dims: bool = True, skips: Optional[List[Tuple[int, int]]] = None):
+        super().__init__()
+        self.blocks = nn.ModuleList(blocks)
+        self.skips = skips or []
+        if validate_dims and len(blocks) > 1:
+            for i in range(1, len(blocks)):
+                prev = blocks[i - 1]
+                cur = blocks[i]
+                if getattr(prev, "output_dim", None) is None or getattr(cur, "output_dim", None) is None:
+                    continue
+                if prev.output_dim != cur.output_dim and not isinstance(cur, LinearBlock):
+                    # Allow LinearBlock to change dims; others must preserve unless they project internally
+                    pass  # Only warn; users may know what they're doing
+    def forward(self, x: torch.Tensor, checkpoint: bool = False, **kwargs) -> torch.Tensor:
+        activations: Dict[int, torch.Tensor] = {}
+        for i, block in enumerate(self.blocks):
+            if checkpoint and x.requires_grad:
+                x = torch.utils.checkpoint.checkpoint(lambda inp: block(inp, **kwargs), x)
+            else:
+                x = block(x, **kwargs)
+            activations[i] = x
+            # Apply any pending skips to this idx
+            for src, dst in self.skips:
+                if dst == i and src in activations:
+                    x = x + activations[src]
+        return x
+# ---------------------------- Factory ---------------------------- #
+class BlockFactory:
+    """Factory to build blocks/sequences from configs.
+    This is intentionally minimal; extend as needed.
+    """
+    @staticmethod
+    def build_block(cfg: Union[BlockConfig, Dict[str, Any]]) -> BaseBlock:
+        # Allow dict-like
+        if isinstance(cfg, dict):
+            type_name = cfg.get("type")
+            # copy and remove 'type' to satisfy dataclass init
+            params = dict(cfg)
+            params.pop("type", None)
+            if type_name == "linear":
+                return LinearBlock(LinearBlockConfig(**params))
+            if type_name == "attention":
+                return AttentionBlock(AttentionBlockConfig(**params))
+            if type_name == "recurrent":
+                return RecurrentBlock(RecurrentBlockConfig(**params))
+            if type_name == "conv1d":
+                return ConvolutionalBlock(ConvolutionalBlockConfig(**params))
+            raise ValueError(f"Unsupported block type in dict cfg: {type_name} cfg={cfg}")
+        # Dataclass path
+        if isinstance(cfg, LinearBlockConfig) or getattr(cfg, "type", None) == "linear":
+            if not isinstance(cfg, LinearBlockConfig):
+                cfg = LinearBlockConfig(**cfg.__dict__)  # type: ignore[arg-type]
+            return LinearBlock(cfg)
+        if isinstance(cfg, AttentionBlockConfig) or getattr(cfg, "type", None) == "attention":
+            if not isinstance(cfg, AttentionBlockConfig):
+                cfg = AttentionBlockConfig(**cfg.__dict__)  # type: ignore[arg-type]
+            return AttentionBlock(cfg)
+        if isinstance(cfg, RecurrentBlockConfig) or getattr(cfg, "type", None) == "recurrent":
+            if not isinstance(cfg, RecurrentBlockConfig):
+                cfg = RecurrentBlockConfig(**cfg.__dict__)  # type: ignore[arg-type]
+            return RecurrentBlock(cfg)
+        if isinstance(cfg, ConvolutionalBlockConfig) or getattr(cfg, "type", None) == "conv1d":
+            if not isinstance(cfg, ConvolutionalBlockConfig):
+                cfg = ConvolutionalBlockConfig(**cfg.__dict__)  # type: ignore[arg-type]
+            return ConvolutionalBlock(cfg)
+        if isinstance(cfg, VariationalBlockConfig) or getattr(cfg, "type", None) == "variational":
+            if not isinstance(cfg, VariationalBlockConfig):
+                cfg = VariationalBlockConfig(**cfg.__dict__)  # type: ignore[arg-type]
+            return VariationalBlock(cfg)
+        raise ValueError(f"Unsupported block type: {cfg}")
+    @staticmethod
+    def build_sequence(configs: Sequence[Union[BlockConfig, Dict[str, Any]]]) -> BlockSequence:
+        blocks: List[BaseBlock] = [BlockFactory.build_block(c) for c in configs]
+        return BlockSequence(blocks)
+__all__ = [
+    "BlockConfig",
+    "LinearBlockConfig",
+    "AttentionBlockConfig",
+    "RecurrentBlockConfig",
+    "ConvolutionalBlockConfig",
+    "VariationalBlockConfig",
+    "BaseBlock",
+    "ResidualBlock",
+    "LinearBlock",
+    "AttentionBlock",
+    "RecurrentBlock",
+    "ConvolutionalBlock",
+    "VariationalBlock",
+    "BlockSequence",
+    "BlockFactory",
+]

config.json CHANGED Viewed

@@ -10,7 +10,9 @@
   "autoencoder_type": "classic",
   "beta": 1.0,
   "bidirectional": true,
   "dropout_rate": 0.1,
   "flow_coupling_layers": 2,
   "hidden_dims": [
     16,

   "autoencoder_type": "classic",
   "beta": 1.0,
   "bidirectional": true,
+  "decoder_blocks": null,
   "dropout_rate": 0.1,
+  "encoder_blocks": null,
   "flow_coupling_layers": 2,
   "hidden_dims": [
     16,

configuration_autoencoder.py CHANGED Viewed

@@ -2,6 +2,9 @@
 Autoencoder configuration for Hugging Face Transformers.
 """
 from transformers import PretrainedConfig
 from typing import List, Optional
@@ -11,25 +14,114 @@ try:
 except Exception:  # pragma: no cover
     _pkg_version = None
 class AutoencoderConfig(PretrainedConfig):
     """
     Configuration class for Autoencoder models.
     This configuration class stores the configuration of an autoencoder model. It is used to instantiate
     an autoencoder model according to the specified arguments, defining the model architecture.
     Args:
         input_dim (int, optional): Dimensionality of the input data. Defaults to 784.
-        hidden_dims (List[int], optional): List of hidden layer dimensions for the encoder.
-            The decoder will use the reverse of this list. Defaults to [512, 256, 128].
         latent_dim (int, optional): Dimensionality of the latent space. Defaults to 64.
-        activation (str, optional): Activation function to use. Options: "relu", "tanh", "sigmoid",
-            "leaky_relu", "gelu", "swish", "silu", "elu", "prelu", "relu6", "hardtanh",
-            "hardsigmoid", "hardswish", "mish", "softplus", "softsign", "tanhshrink", "threshold".
-            Defaults to "relu".
-        dropout_rate (float, optional): Dropout rate for regularization. Defaults to 0.1.
-        use_batch_norm (bool, optional): Whether to use batch normalization. Defaults to True.
         tie_weights (bool, optional): Whether to tie encoder and decoder weights. Defaults to False.
         reconstruction_loss (str, optional): Type of reconstruction loss. Options: "mse", "bce", "l1",
             "huber", "smooth_l1", "kl_div", "cosine", "focal", "dice", "tversky", "ssim", "perceptual".
@@ -57,13 +149,15 @@ class AutoencoderConfig(PretrainedConfig):
         flow_coupling_layers (int, optional): Number of coupling layers for normalizing flows. Defaults to 4.
         **kwargs: Additional keyword arguments passed to the parent class.
     """
     model_type = "autoencoder"
     def __init__(
         self,
         input_dim: int = 784,
         hidden_dims: List[int] = None,
         latent_dim: int = 64,
         activation: str = "relu",
         dropout_rate: float = 0.1,
@@ -92,7 +186,7 @@ class AutoencoderConfig(PretrainedConfig):
         # Validate parameters
         if hidden_dims is None:
             hidden_dims = [512, 256, 128]
         # Extended activation functions
         valid_activations = [
             "relu", "tanh", "sigmoid", "leaky_relu", "gelu", "swish", "silu",
@@ -127,19 +221,19 @@ class AutoencoderConfig(PretrainedConfig):
             raise ValueError(
                 f"`rnn_type` must be one of {valid_rnn_types}, got {rnn_type}."
             )
         if not (0.0 <= dropout_rate <= 1.0):
             raise ValueError(f"`dropout_rate` must be between 0.0 and 1.0, got {dropout_rate}.")
         if input_dim <= 0:
             raise ValueError(f"`input_dim` must be positive, got {input_dim}.")
         if latent_dim <= 0:
             raise ValueError(f"`latent_dim` must be positive, got {latent_dim}.")
         if not all(dim > 0 for dim in hidden_dims):
             raise ValueError("All dimensions in `hidden_dims` must be positive.")
         if beta <= 0:
             raise ValueError(f"`beta` must be positive, got {beta}.")
@@ -174,10 +268,12 @@ class AutoencoderConfig(PretrainedConfig):
         if flow_coupling_layers <= 0:
             raise ValueError(f"`flow_coupling_layers` must be positive, got {flow_coupling_layers}.")
         # Set configuration attributes
         self.input_dim = input_dim
         self.hidden_dims = hidden_dims
         self.latent_dim = latent_dim
         self.activation = activation
         self.dropout_rate = dropout_rate
@@ -199,15 +295,20 @@ class AutoencoderConfig(PretrainedConfig):
         self.preprocessing_num_layers = preprocessing_num_layers
         self.learn_inverse_preprocessing = learn_inverse_preprocessing
         self.flow_coupling_layers = flow_coupling_layers
         # Call parent constructor
         super().__init__(**kwargs)
     @property
     def decoder_dims(self) -> List[int]:
         """Get decoder dimensions (reverse of encoder hidden dims)."""
         return list(reversed(self.hidden_dims))
     @property
     def is_variational(self) -> bool:
         """Check if this is a variational autoencoder."""

 Autoencoder configuration for Hugging Face Transformers.
 """
+from dataclasses import dataclass
+from typing import Union
 from transformers import PretrainedConfig
 from typing import List, Optional
 except Exception:  # pragma: no cover
     _pkg_version = None
+@dataclass
+class BlockConfig:
+    """Base class shared by all block configuration dataclasses."""
+    # Discriminator string identifying the block kind; each subclass below
+    # passes its own fixed value from its __init__.
+    type: str
+@dataclass
+class LinearBlockConfig(BlockConfig):
+    input_dim: int
+    output_dim: int
+    activation: str = "relu"
+    normalization: Optional[str] = "batch"  # batch|layer|group|instance|none
+    dropout_rate: float = 0.0
+    use_residual: bool = False
+    residual_scale: float = 1.0
+    def __init__(self, input_dim: int, output_dim: int, activation: str = "relu", normalization: Optional[str] = "batch", dropout_rate: float = 0.0, use_residual: bool = False, residual_scale: float = 1.0):
+        super().__init__(type="linear")
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.activation = activation
+        self.normalization = normalization
+        self.dropout_rate = dropout_rate
+        self.use_residual = use_residual
+        self.residual_scale = residual_scale
+@dataclass
+class AttentionBlockConfig(BlockConfig):
+    input_dim: int
+    num_heads: int = 8
+    ffn_dim: Optional[int] = None
+    dropout_rate: float = 0.0
+    def __init__(self, input_dim: int, num_heads: int = 8, ffn_dim: Optional[int] = None, dropout_rate: float = 0.0):
+        super().__init__(type="attention")
+        self.input_dim = input_dim
+        self.num_heads = num_heads
+        self.ffn_dim = ffn_dim
+        self.dropout_rate = dropout_rate
+@dataclass
+class RecurrentBlockConfig(BlockConfig):
+    input_dim: int
+    hidden_size: int
+    num_layers: int = 1
+    rnn_type: str = "lstm"  # lstm|gru|rnn
+    bidirectional: bool = False
+    dropout_rate: float = 0.0
+    output_dim: Optional[int] = None  # if None, use hidden_size * directions
+    def __init__(self, input_dim: int, hidden_size: int, num_layers: int = 1, rnn_type: str = "lstm", bidirectional: bool = False, dropout_rate: float = 0.0, output_dim: Optional[int] = None):
+        super().__init__(type="recurrent")
+        self.input_dim = input_dim
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.rnn_type = rnn_type
+        self.bidirectional = bidirectional
+        self.dropout_rate = dropout_rate
+        self.output_dim = output_dim
+@dataclass
+class ConvolutionalBlockConfig(BlockConfig):
+    input_dim: int  # channels in (features)
+    output_dim: int  # channels out
+    kernel_size: int = 3
+    padding: Union[int, str] = "same"  # "same" or int
+    activation: str = "relu"
+    normalization: Optional[str] = "batch"
+    dropout_rate: float = 0.0
+    def __init__(self, input_dim: int, output_dim: int, kernel_size: int = 3, padding: Union[int, str] = "same", activation: str = "relu", normalization: Optional[str] = "batch", dropout_rate: float = 0.0):
+        super().__init__(type="conv1d")
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.kernel_size = kernel_size
+        self.padding = padding
+        self.activation = activation
+        self.normalization = normalization
+        self.dropout_rate = dropout_rate
+@dataclass
+class VariationalBlockConfig(BlockConfig):
+    input_dim: int
+    latent_dim: int
+    def __init__(self, input_dim: int, latent_dim: int):
+        super().__init__(type="variational")
+        self.input_dim = input_dim
+        self.latent_dim = latent_dim
 class AutoencoderConfig(PretrainedConfig):
     """
     Configuration class for Autoencoder models.
     This configuration class stores the configuration of an autoencoder model. It is used to instantiate
     an autoencoder model according to the specified arguments, defining the model architecture.
     Args:
         input_dim (int, optional): Dimensionality of the input data. Defaults to 784.
+        hidden_dims (List[int], optional): Legacy: List of hidden layer dims for simple MLP encoder.
+        encoder_blocks (List[dict], optional): New: List of block configs for encoder.
+        decoder_blocks (List[dict], optional): New: List of block configs for decoder.
         latent_dim (int, optional): Dimensionality of the latent space. Defaults to 64.
+        activation (str, optional): Default activation for Linear blocks. See supported list below.
+        dropout_rate (float, optional): Default dropout for Linear blocks. Defaults to 0.1.
+        use_batch_norm (bool, optional): Default normalization for Linear blocks (batch vs none). Defaults to True.
         tie_weights (bool, optional): Whether to tie encoder and decoder weights. Defaults to False.
         reconstruction_loss (str, optional): Type of reconstruction loss. Options: "mse", "bce", "l1",
             "huber", "smooth_l1", "kl_div", "cosine", "focal", "dice", "tversky", "ssim", "perceptual".
         flow_coupling_layers (int, optional): Number of coupling layers for normalizing flows. Defaults to 4.
         **kwargs: Additional keyword arguments passed to the parent class.
     """
     model_type = "autoencoder"
     def __init__(
         self,
         input_dim: int = 784,
         hidden_dims: List[int] = None,
+        encoder_blocks: Optional[List[dict]] = None,
+        decoder_blocks: Optional[List[dict]] = None,
         latent_dim: int = 64,
         activation: str = "relu",
         dropout_rate: float = 0.1,
         # Validate parameters
         if hidden_dims is None:
             hidden_dims = [512, 256, 128]
         # Extended activation functions
         valid_activations = [
             "relu", "tanh", "sigmoid", "leaky_relu", "gelu", "swish", "silu",
             raise ValueError(
                 f"`rnn_type` must be one of {valid_rnn_types}, got {rnn_type}."
             )
         if not (0.0 <= dropout_rate <= 1.0):
             raise ValueError(f"`dropout_rate` must be between 0.0 and 1.0, got {dropout_rate}.")
         if input_dim <= 0:
             raise ValueError(f"`input_dim` must be positive, got {input_dim}.")
         if latent_dim <= 0:
             raise ValueError(f"`latent_dim` must be positive, got {latent_dim}.")
         if not all(dim > 0 for dim in hidden_dims):
             raise ValueError("All dimensions in `hidden_dims` must be positive.")
         if beta <= 0:
             raise ValueError(f"`beta` must be positive, got {beta}.")
         if flow_coupling_layers <= 0:
             raise ValueError(f"`flow_coupling_layers` must be positive, got {flow_coupling_layers}.")
         # Set configuration attributes
         self.input_dim = input_dim
         self.hidden_dims = hidden_dims
+        self.encoder_blocks = encoder_blocks
+        self.decoder_blocks = decoder_blocks
         self.latent_dim = latent_dim
         self.activation = activation
         self.dropout_rate = dropout_rate
         self.preprocessing_num_layers = preprocessing_num_layers
         self.learn_inverse_preprocessing = learn_inverse_preprocessing
         self.flow_coupling_layers = flow_coupling_layers
         # Call parent constructor
         super().__init__(**kwargs)
     @property
     def decoder_dims(self) -> List[int]:
         """Get decoder dimensions (reverse of encoder hidden dims)."""
         return list(reversed(self.hidden_dims))
+    @property
+    def has_block_lists(self) -> bool:
+        """Whether explicit encoder/decoder block configs are provided."""
+        return (self.encoder_blocks is not None) or (self.decoder_blocks is not None)
     @property
     def is_variational(self) -> bool:
         """Check if this is a variational autoencoder."""

model.safetensors CHANGED Viewed

Binary files a/model.safetensors and b/model.safetensors differ

modeling_autoencoder.py CHANGED Viewed

@@ -8,6 +8,7 @@ import torch.nn.functional as F
 from typing import Optional, Tuple, Union, Dict, Any, List
 from dataclasses import dataclass
 import random
 from transformers import PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutput
@@ -18,653 +19,41 @@ try:
 except Exception:
     from configuration_autoencoder import AutoencoderConfig  # local usage
-class NeuralScaler(nn.Module):
-    """Learnable alternative to StandardScaler using neural networks."""
-    def __init__(self, config: AutoencoderConfig):
-        super().__init__()
-        self.config = config
-        input_dim = config.input_dim
-        hidden_dim = config.preprocessing_hidden_dim
-        # Networks to learn data-dependent statistics
-        self.mean_estimator = nn.Sequential(
-            nn.Linear(input_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, input_dim)
-        )
-        self.std_estimator = nn.Sequential(
-            nn.Linear(input_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, input_dim),
-            nn.Softplus()  # Ensure positive standard deviation
-        )
-        # Learnable affine transformation parameters
-        self.weight = nn.Parameter(torch.ones(input_dim))
-        self.bias = nn.Parameter(torch.zeros(input_dim))
-        # Running statistics for inference (like BatchNorm)
-        self.register_buffer('running_mean', torch.zeros(input_dim))
-        self.register_buffer('running_std', torch.ones(input_dim))
-        self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))
-        # Momentum for running statistics
-        self.momentum = 0.1
-    def forward(self, x: torch.Tensor, inverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Forward pass through neural scaler.
-        Args:
-            x: Input tensor (2D or 3D)
-            inverse: Whether to apply inverse transformation
-        Returns:
-            Tuple of (transformed_tensor, regularization_loss)
-        """
-        if inverse:
-            return self._inverse_transform(x)
-        # Handle both 2D and 3D tensors
-        original_shape = x.shape
-        if x.dim() == 3:
-            # Reshape (batch, seq, features) -> (batch*seq, features)
-            x = x.view(-1, x.size(-1))
-        if self.training:
-            # Training mode: learn statistics from current batch
-            batch_mean = x.mean(dim=0, keepdim=True)
-            batch_std = x.std(dim=0, keepdim=True)
-            # Learn data-dependent adjustments
-            learned_mean_adj = self.mean_estimator(batch_mean)
-            learned_std_adj = self.std_estimator(batch_std)
-            # Combine batch statistics with learned adjustments
-            effective_mean = batch_mean + learned_mean_adj
-            effective_std = batch_std + learned_std_adj + 1e-8
-            # Update running statistics
-            with torch.no_grad():
-                self.num_batches_tracked += 1
-                if self.num_batches_tracked == 1:
-                    self.running_mean.copy_(batch_mean.squeeze())
-                    self.running_std.copy_(batch_std.squeeze())
-                else:
-                    self.running_mean.mul_(1 - self.momentum).add_(batch_mean.squeeze(), alpha=self.momentum)
-                    self.running_std.mul_(1 - self.momentum).add_(batch_std.squeeze(), alpha=self.momentum)
-        else:
-            # Inference mode: use running statistics
-            effective_mean = self.running_mean.unsqueeze(0)
-            effective_std = self.running_std.unsqueeze(0) + 1e-8
-        # Normalize
-        normalized = (x - effective_mean) / effective_std
-        # Apply learnable affine transformation
-        transformed = normalized * self.weight + self.bias
-        # Reshape back to original shape if needed
-        if len(original_shape) == 3:
-            transformed = transformed.view(original_shape)
-        # Regularization loss to encourage meaningful learning
-        reg_loss = 0.01 * (self.weight.var() + self.bias.var())
-        return transformed, reg_loss
-    def _inverse_transform(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Apply inverse transformation to get back original scale."""
-        if not self.config.learn_inverse_preprocessing:
-            return x, torch.tensor(0.0, device=x.device)
-        # Handle both 2D and 3D tensors
-        original_shape = x.shape
-        if x.dim() == 3:
-            # Reshape (batch, seq, features) -> (batch*seq, features)
-            x = x.view(-1, x.size(-1))
-        # Reverse affine transformation
-        x = (x - self.bias) / (self.weight + 1e-8)
-        # Reverse normalization using running statistics
-        effective_mean = self.running_mean.unsqueeze(0)
-        effective_std = self.running_std.unsqueeze(0) + 1e-8
-        x = x * effective_std + effective_mean
-        # Reshape back to original shape if needed
-        if len(original_shape) == 3:
-            x = x.view(original_shape)
-        return x, torch.tensor(0.0, device=x.device)
-class LearnableMinMaxScaler(nn.Module):
-    """Learnable MinMax scaler that adapts bounds during training.
-    Scales features to [0, 1] using batch min/range with learnable adjustments and
-    a learnable affine transform. Supports 2D (B, F) and 3D (B, T, F) inputs.
-    """
-    def __init__(self, config: AutoencoderConfig):
-        super().__init__()
-        self.config = config
-        input_dim = config.input_dim
-        hidden_dim = config.preprocessing_hidden_dim
-        # Networks to learn adjustments to batch min and range
-        self.min_estimator = nn.Sequential(
-            nn.Linear(input_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, input_dim),
-        )
-        self.range_estimator = nn.Sequential(
-            nn.Linear(input_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, input_dim),
-            nn.Softplus(),  # Ensure positive adjustment to range
-        )
-        # Learnable affine transformation parameters
-        self.weight = nn.Parameter(torch.ones(input_dim))
-        self.bias = nn.Parameter(torch.zeros(input_dim))
-        # Running statistics for inference
-        self.register_buffer("running_min", torch.zeros(input_dim))
-        self.register_buffer("running_range", torch.ones(input_dim))
-        self.register_buffer("num_batches_tracked", torch.tensor(0, dtype=torch.long))
-        self.momentum = 0.1
-    def forward(self, x: torch.Tensor, inverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
-        if inverse:
-            return self._inverse_transform(x)
-        original_shape = x.shape
-        if x.dim() == 3:
-            x = x.view(-1, x.size(-1))
-        eps = 1e-8
-        if self.training:
-            batch_min = x.min(dim=0, keepdim=True).values
-            batch_max = x.max(dim=0, keepdim=True).values
-            batch_range = (batch_max - batch_min).clamp_min(eps)
-            # Learn adjustments
-            learned_min_adj = self.min_estimator(batch_min)
-            learned_range_adj = self.range_estimator(batch_range)
-            effective_min = batch_min + learned_min_adj
-            effective_range = batch_range + learned_range_adj + eps
-            # Update running stats with raw batch min/range for stable inversion
-            with torch.no_grad():
-                self.num_batches_tracked += 1
-                if self.num_batches_tracked == 1:
-                    self.running_min.copy_(batch_min.squeeze())
-                    self.running_range.copy_(batch_range.squeeze())
-                else:
-                    self.running_min.mul_(1 - self.momentum).add_(batch_min.squeeze(), alpha=self.momentum)
-                    self.running_range.mul_(1 - self.momentum).add_(batch_range.squeeze(), alpha=self.momentum)
-        else:
-            effective_min = self.running_min.unsqueeze(0)
-            effective_range = self.running_range.unsqueeze(0)
-        # Scale to [0, 1]
-        scaled = (x - effective_min) / effective_range
-        # Learnable affine transform
-        transformed = scaled * self.weight + self.bias
-        if len(original_shape) == 3:
-            transformed = transformed.view(original_shape)
-        # Regularization: encourage non-degenerate range and modest affine params
-        reg_loss = 0.01 * (self.weight.var() + self.bias.var())
-        if self.training:
-            reg_loss = reg_loss + 0.001 * (1.0 / effective_range.clamp_min(1e-3)).mean()
-        return transformed, reg_loss
-    def _inverse_transform(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        if not self.config.learn_inverse_preprocessing:
-            return x, torch.tensor(0.0, device=x.device)
-        original_shape = x.shape
-        if x.dim() == 3:
-            x = x.view(-1, x.size(-1))
-        # Reverse affine
-        x = (x - self.bias) / (self.weight + 1e-8)
-        # Reverse MinMax using running stats
-        x = x * self.running_range.unsqueeze(0) + self.running_min.unsqueeze(0)
-        if len(original_shape) == 3:
-            x = x.view(original_shape)
-        return x, torch.tensor(0.0, device=x.device)
-class LearnableRobustScaler(nn.Module):
-    """Learnable Robust scaler using median and IQR with learnable adjustments.
-    Normalizes as (x - median) / IQR with learnable adjustments and an affine head.
-    Supports 2D (B, F) and 3D (B, T, F) inputs.
-    """
-    def __init__(self, config: AutoencoderConfig):
-        super().__init__()
-        self.config = config
-        input_dim = config.input_dim
-        hidden_dim = config.preprocessing_hidden_dim
-        self.median_estimator = nn.Sequential(
-            nn.Linear(input_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, input_dim),
-        )
-        self.iqr_estimator = nn.Sequential(
-            nn.Linear(input_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, input_dim),
-            nn.Softplus(),  # Ensure positive IQR adjustment
-        )
-        self.weight = nn.Parameter(torch.ones(input_dim))
-        self.bias = nn.Parameter(torch.zeros(input_dim))
-        self.register_buffer("running_median", torch.zeros(input_dim))
-        self.register_buffer("running_iqr", torch.ones(input_dim))
-        self.register_buffer("num_batches_tracked", torch.tensor(0, dtype=torch.long))
-        self.momentum = 0.1
-    def forward(self, x: torch.Tensor, inverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
-        if inverse:
-            return self._inverse_transform(x)
-        original_shape = x.shape
-        if x.dim() == 3:
-            x = x.view(-1, x.size(-1))
-        eps = 1e-8
-        if self.training:
-            qs = torch.quantile(x, torch.tensor([0.25, 0.5, 0.75], device=x.device), dim=0)
-            q25, med, q75 = qs[0:1, :], qs[1:2, :], qs[2:3, :]
-            iqr = (q75 - q25).clamp_min(eps)
-            learned_med_adj = self.median_estimator(med)
-            learned_iqr_adj = self.iqr_estimator(iqr)
-            effective_median = med + learned_med_adj
-            effective_iqr = iqr + learned_iqr_adj + eps
-            with torch.no_grad():
-                self.num_batches_tracked += 1
-                if self.num_batches_tracked == 1:
-                    self.running_median.copy_(med.squeeze())
-                    self.running_iqr.copy_(iqr.squeeze())
-                else:
-                    self.running_median.mul_(1 - self.momentum).add_(med.squeeze(), alpha=self.momentum)
-                    self.running_iqr.mul_(1 - self.momentum).add_(iqr.squeeze(), alpha=self.momentum)
-        else:
-            effective_median = self.running_median.unsqueeze(0)
-            effective_iqr = self.running_iqr.unsqueeze(0)
-        normalized = (x - effective_median) / effective_iqr
-        transformed = normalized * self.weight + self.bias
-        if len(original_shape) == 3:
-            transformed = transformed.view(original_shape)
-        reg_loss = 0.01 * (self.weight.var() + self.bias.var())
-        if self.training:
-            reg_loss = reg_loss + 0.001 * (1.0 / effective_iqr.clamp_min(1e-3)).mean()
-        return transformed, reg_loss
-    def _inverse_transform(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        if not self.config.learn_inverse_preprocessing:
-            return x, torch.tensor(0.0, device=x.device)
-        original_shape = x.shape
-        if x.dim() == 3:
-            x = x.view(-1, x.size(-1))
-        x = (x - self.bias) / (self.weight + 1e-8)
-        x = x * self.running_iqr.unsqueeze(0) + self.running_median.unsqueeze(0)
-        if len(original_shape) == 3:
-            x = x.view(original_shape)
-        return x, torch.tensor(0.0, device=x.device)
-class LearnableYeoJohnsonPreprocessor(nn.Module):
-    """Learnable Yeo-Johnson power transform with per-feature λ and affine head.
-    Applies Yeo-Johnson transform elementwise with learnable lambda per feature,
-    followed by standardization and a learnable affine transform. Supports 2D and 3D inputs.
-    """
-    def __init__(self, config: AutoencoderConfig):
-        super().__init__()
-        self.config = config
-        input_dim = config.input_dim
-        # Learnable lambda per feature (unconstrained). Initialize around 1.0
-        self.lmbda = nn.Parameter(torch.ones(input_dim))
-        # Learnable affine parameters after standardization
-        self.weight = nn.Parameter(torch.ones(input_dim))
-        self.bias = nn.Parameter(torch.zeros(input_dim))
-        # Running stats for transformed data
-        self.register_buffer("running_mean", torch.zeros(input_dim))
-        self.register_buffer("running_std", torch.ones(input_dim))
-        self.register_buffer("num_batches_tracked", torch.tensor(0, dtype=torch.long))
-        self.momentum = 0.1
-    def _yeo_johnson(self, x: torch.Tensor, lmbda: torch.Tensor) -> torch.Tensor:
-        eps = 1e-6
-        lmbda = lmbda.unsqueeze(0)  # broadcast over batch
-        pos = x >= 0
-        # For x >= 0
-        if_part = torch.where(
-            torch.abs(lmbda) > eps,
-            ((x + 1.0).clamp_min(eps) ** lmbda - 1.0) / lmbda,
-            torch.log((x + 1.0).clamp_min(eps)),
-        )
-        # For x < 0
-        two_minus_lambda = 2.0 - lmbda
-        else_part = torch.where(
-            torch.abs(two_minus_lambda) > eps,
-            -(((1.0 - x).clamp_min(eps)) ** two_minus_lambda - 1.0) / two_minus_lambda,
-            -torch.log((1.0 - x).clamp_min(eps)),
-        )
-        return torch.where(pos, if_part, else_part)
-    def _yeo_johnson_inverse(self, y: torch.Tensor, lmbda: torch.Tensor) -> torch.Tensor:
-        eps = 1e-6
-        lmbda = lmbda.unsqueeze(0)
-        pos = y >= 0
-        # Inverse for y >= 0
-        x_pos = torch.where(
-            torch.abs(lmbda) > eps,
-            (y * lmbda + 1.0).clamp_min(eps) ** (1.0 / lmbda) - 1.0,
-            torch.exp(y) - 1.0,
-        )
-        # Inverse for y < 0
-        two_minus_lambda = 2.0 - lmbda
-        x_neg = torch.where(
-            torch.abs(two_minus_lambda) > eps,
-            1.0 - (1.0 - y * two_minus_lambda).clamp_min(eps) ** (1.0 / two_minus_lambda),
-            1.0 - torch.exp(-y),
-        )
-        return torch.where(pos, x_pos, x_neg)
-    def forward(self, x: torch.Tensor, inverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
-        if inverse:
-            return self._inverse_transform(x)
-        orig_shape = x.shape
-        if x.dim() == 3:
-            x = x.view(-1, x.size(-1))
-        # Apply Yeo-Johnson
-        y = self._yeo_johnson(x, self.lmbda)
-        # Batch stats and running stats on transformed data
-        if self.training:
-            batch_mean = y.mean(dim=0, keepdim=True)
-            batch_std = y.std(dim=0, keepdim=True).clamp_min(1e-6)
-            with torch.no_grad():
-                self.num_batches_tracked += 1
-                if self.num_batches_tracked == 1:
-                    self.running_mean.copy_(batch_mean.squeeze())
-                    self.running_std.copy_(batch_std.squeeze())
-                else:
-                    self.running_mean.mul_(1 - self.momentum).add_(batch_mean.squeeze(), alpha=self.momentum)
-                    self.running_std.mul_(1 - self.momentum).add_(batch_std.squeeze(), alpha=self.momentum)
-            mean = batch_mean
-            std = batch_std
-        else:
-            mean = self.running_mean.unsqueeze(0)
-            std = self.running_std.unsqueeze(0)
-        y_norm = (y - mean) / std
-        out = y_norm * self.weight + self.bias
-        if len(orig_shape) == 3:
-            out = out.view(orig_shape)
-        # Regularize lambda to avoid extreme values; encourage identity around 1
-        reg = 0.001 * (self.lmbda - 1.0).pow(2).mean() + 0.01 * (self.weight.var() + self.bias.var())
-        return out, reg
-    def _inverse_transform(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        if not self.config.learn_inverse_preprocessing:
-            return x, torch.tensor(0.0, device=x.device)
-        orig_shape = x.shape
-        if x.dim() == 3:
-            x = x.view(-1, x.size(-1))
-        # Reverse affine and normalization with running stats
-        y = (x - self.bias) / (self.weight + 1e-8)
-        y = y * self.running_std.unsqueeze(0) + self.running_mean.unsqueeze(0)
-        # Inverse Yeo-Johnson
-        out = self._yeo_johnson_inverse(y, self.lmbda)
-        if len(orig_shape) == 3:
-            out = out.view(orig_shape)
-        return out, torch.tensor(0.0, device=x.device)
-class CouplingLayer(nn.Module):
-    """Coupling layer for normalizing flows."""
-    def __init__(self, input_dim: int, hidden_dim: int = 64, mask_type: str = "alternating"):
-        super().__init__()
-        self.input_dim = input_dim
-        self.hidden_dim = hidden_dim
-        # Create mask for coupling
-        if mask_type == "alternating":
-            self.register_buffer('mask', torch.arange(input_dim) % 2)
-        elif mask_type == "half":
-            mask = torch.zeros(input_dim)
-            mask[:input_dim // 2] = 1
-            self.register_buffer('mask', mask)
-        else:
-            raise ValueError(f"Unknown mask type: {mask_type}")
-        # Scale and translation networks
-        masked_dim = int(self.mask.sum().item())
-        unmasked_dim = input_dim - masked_dim
-        self.scale_net = nn.Sequential(
-            nn.Linear(masked_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, unmasked_dim),
-            nn.Tanh()  # Bounded output for stability
-        )
-        self.translate_net = nn.Sequential(
-            nn.Linear(masked_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, unmasked_dim)
-        )
-    def forward(self, x: torch.Tensor, inverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Forward pass through coupling layer.
-        Args:
-            x: Input tensor
-            inverse: Whether to apply inverse transformation
-        Returns:
-            Tuple of (transformed_tensor, log_determinant)
-        """
-        mask = self.mask.bool()
-        x_masked = x[:, mask]
-        x_unmasked = x[:, ~mask]
-        # Compute scale and translation
-        s = self.scale_net(x_masked)
-        t = self.translate_net(x_masked)
-        if not inverse:
-            # Forward transformation
-            y_unmasked = x_unmasked * torch.exp(s) + t
-            log_det = s.sum(dim=1)
-        else:
-            # Inverse transformation
-            y_unmasked = (x_unmasked - t) * torch.exp(-s)
-            log_det = -s.sum(dim=1)
-        # Reconstruct output
-        y = torch.zeros_like(x)
-        y[:, mask] = x_masked
-        y[:, ~mask] = y_unmasked
-        return y, log_det
-class NormalizingFlowPreprocessor(nn.Module):
-    """Normalizing flow for learnable data preprocessing."""
-    def __init__(self, config: AutoencoderConfig):
-        super().__init__()
-        self.config = config
-        input_dim = config.input_dim
-        hidden_dim = config.preprocessing_hidden_dim
-        num_layers = config.flow_coupling_layers
-        # Create coupling layers with alternating masks
-        self.layers = nn.ModuleList()
-        for i in range(num_layers):
-            mask_type = "alternating" if i % 2 == 0 else "half"
-            self.layers.append(CouplingLayer(input_dim, hidden_dim, mask_type))
-        # Optional: Add batch normalization between layers
-        if config.use_batch_norm:
-            self.batch_norms = nn.ModuleList([
-                nn.BatchNorm1d(input_dim) for _ in range(num_layers - 1)
-            ])
-        else:
-            self.batch_norms = None
-    def forward(self, x: torch.Tensor, inverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Forward pass through normalizing flow.
-        Args:
-            x: Input tensor (2D or 3D)
-            inverse: Whether to apply inverse transformation
-        Returns:
-            Tuple of (transformed_tensor, total_log_determinant)
-        """
-        # Handle both 2D and 3D tensors
-        original_shape = x.shape
-        if x.dim() == 3:
-            # Reshape (batch, seq, features) -> (batch*seq, features)
-            x = x.view(-1, x.size(-1))
-        log_det_total = torch.zeros(x.size(0), device=x.device)
-        if not inverse:
-            # Forward pass
-            for i, layer in enumerate(self.layers):
-                x, log_det = layer(x, inverse=False)
-                log_det_total += log_det
-                # Apply batch normalization (except for last layer)
-                if self.batch_norms and i < len(self.layers) - 1:
-                    x = self.batch_norms[i](x)
-        else:
-            # Inverse pass
-            for i, layer in enumerate(reversed(self.layers)):
-                # Reverse batch normalization (except for first layer in reverse)
-                if self.batch_norms and i > 0:
-                    # Note: This is approximate inverse of batch norm
-                    bn_idx = len(self.layers) - 1 - i
-                    x = self.batch_norms[bn_idx](x)
-                x, log_det = layer(x, inverse=True)
-                log_det_total += log_det
-        # Reshape back to original shape if needed
-        if len(original_shape) == 3:
-            x = x.view(original_shape)
-        # Convert log determinant to regularization loss
-        # Encourage the flow to preserve information (log_det close to 0)
-        reg_loss = 0.01 * log_det_total.abs().mean()
-        return x, reg_loss
-class LearnablePreprocessor(nn.Module):
-    """Unified interface for learnable preprocessing methods."""
-    def __init__(self, config: AutoencoderConfig):
-        super().__init__()
-        self.config = config
-        if not config.has_preprocessing:
-            self.preprocessor = nn.Identity()
-        elif config.is_neural_scaler:
-            self.preprocessor = NeuralScaler(config)
-        elif config.is_normalizing_flow:
-            self.preprocessor = NormalizingFlowPreprocessor(config)
-        elif getattr(config, "is_minmax_scaler", False):
-            self.preprocessor = LearnableMinMaxScaler(config)
-        elif getattr(config, "is_robust_scaler", False):
-            self.preprocessor = LearnableRobustScaler(config)
-        elif getattr(config, "is_yeo_johnson", False):
-            self.preprocessor = LearnableYeoJohnsonPreprocessor(config)
-        else:
-            raise ValueError(f"Unknown preprocessing type: {config.preprocessing_type}")
-    def forward(self, x: torch.Tensor, inverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Apply preprocessing transformation.
-        Args:
-            x: Input tensor
-            inverse: Whether to apply inverse transformation
-        Returns:
-            Tuple of (transformed_tensor, regularization_loss)
-        """
-        if isinstance(self.preprocessor, nn.Identity):
-            return x, torch.tensor(0.0, device=x.device)
-        return self.preprocessor(x, inverse=inverse)
 @dataclass
@@ -741,29 +130,6 @@ class AutoencoderEncoder(nn.Module):
             # Standard encoder output
             self.fc_out = nn.Linear(input_dim, config.latent_dim)
-    def _get_activation(self, activation: str) -> nn.Module:
-        """Get activation function by name."""
-        activations = {
-            "relu": nn.ReLU(),
-            "tanh": nn.Tanh(),
-            "sigmoid": nn.Sigmoid(),
-            "leaky_relu": nn.LeakyReLU(),
-            "gelu": nn.GELU(),
-            "swish": nn.SiLU(),
-            "silu": nn.SiLU(),
-            "elu": nn.ELU(),
-            "prelu": nn.PReLU(),
-            "relu6": nn.ReLU6(),
-            "hardtanh": nn.Hardtanh(),
-            "hardsigmoid": nn.Hardsigmoid(),
-            "hardswish": nn.Hardswish(),
-            "mish": nn.Mish(),
-            "softplus": nn.Softplus(),
-            "softsign": nn.Softsign(),
-            "tanhshrink": nn.Tanhshrink(),
-            "threshold": nn.Threshold(threshold=0.1, value=0),
-        }
-        return activations[activation]
     def forward(self, x: torch.Tensor) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         """Forward pass through encoder."""
@@ -820,7 +186,7 @@ class AutoencoderDecoder(nn.Module):
                 if config.use_batch_norm:
                     layers.append(nn.BatchNorm1d(hidden_dim))
-                layers.append(self._get_activation(config.activation))
                 if config.dropout_rate > 0:
                     layers.append(nn.Dropout(config.dropout_rate))
@@ -833,29 +199,6 @@ class AutoencoderDecoder(nn.Module):
         self.decoder = nn.Sequential(*layers)
-    def _get_activation(self, activation: str) -> nn.Module:
-        """Get activation function by name."""
-        activations = {
-            "relu": nn.ReLU(),
-            "tanh": nn.Tanh(),
-            "sigmoid": nn.Sigmoid(),
-            "leaky_relu": nn.LeakyReLU(),
-            "gelu": nn.GELU(),
-            "swish": nn.SiLU(),
-            "silu": nn.SiLU(),
-            "elu": nn.ELU(),
-            "prelu": nn.PReLU(),
-            "relu6": nn.ReLU6(),
-            "hardtanh": nn.Hardtanh(),
-            "hardsigmoid": nn.Hardsigmoid(),
-            "hardswish": nn.Hardswish(),
-            "mish": nn.Mish(),
-            "softplus": nn.Softplus(),
-            "softsign": nn.Softsign(),
-            "tanhshrink": nn.Tanhshrink(),
-            "threshold": nn.Threshold(threshold=0.1, value=0),
-        }
-        return activations[activation]
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Forward pass through decoder."""
@@ -1111,21 +454,75 @@ class AutoencoderModel(PreTrainedModel):
         super().__init__(config)
         self.config = config
-        # Initialize learnable preprocessing
         if config.has_preprocessing:
-            self.preprocessor = LearnablePreprocessor(config)
         else:
-            self.preprocessor = None
-        # Initialize encoder and decoder based on type
-        if config.is_recurrent:
-            self.encoder = RecurrentEncoder(config)
-            self.decoder = RecurrentDecoder(config)
         else:
-            self.encoder = AutoencoderEncoder(config)
-            self.decoder = AutoencoderDecoder(config)
-        # Tie weights if specified
         if config.tie_weights:
             self._tie_weights()
@@ -1173,62 +570,37 @@ class AutoencoderModel(PreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        # Apply learnable preprocessing
         preprocessing_loss = torch.tensor(0.0, device=input_values.device)
-        if self.preprocessor is not None:
-            input_values, preprocessing_loss = self.preprocessor(input_values, inverse=False)
-        # Handle different autoencoder types
-        if self.config.is_recurrent:
-            # Recurrent autoencoder
-            if sequence_lengths is not None:
-                encoder_output = self.encoder(input_values, sequence_lengths)
-            else:
-                encoder_output = self.encoder(input_values)
-            if self.config.is_variational:
-                latent, mu, logvar = encoder_output
-                self._mu = mu
-                self._logvar = logvar
-            else:
-                latent = encoder_output
-                self._mu = None
-                self._logvar = None
-            # Determine target length for decoder
-            if target_length is None:
-                if self.config.sequence_length is not None:
-                    target_length = self.config.sequence_length
-                else:
-                    target_length = input_values.size(1)  # Use input sequence length
-            # Decode latent back to sequence space
-            reconstructed = self.decoder(latent, target_length, input_values if self.training else None)
         else:
-            # Standard autoencoder
-            encoder_output = self.encoder(input_values)
-            if self.config.is_variational:
-                latent, mu, logvar = encoder_output
-                self._mu = mu
-                self._logvar = logvar
-            else:
-                latent = encoder_output
-                self._mu = None
-                self._logvar = None
-            # Decode latent back to input space
-            reconstructed = self.decoder(latent)
-        # Apply inverse preprocessing to reconstruction
-        if self.preprocessor is not None and self.config.learn_inverse_preprocessing:
-            reconstructed, inverse_loss = self.preprocessor(reconstructed, inverse=True)
-            preprocessing_loss += inverse_loss
         hidden_states = None
         if output_hidden_states:
             if self.config.is_variational:
-                hidden_states = (latent, mu, logvar)
             else:
                 hidden_states = (latent,)
@@ -1263,6 +635,8 @@ class AutoencoderForReconstruction(PreTrainedModel):
         # Initialize weights
         self.post_init()
     def get_input_embeddings(self):
         """Return the input embeddings by delegating to the wrapped ``self.autoencoder``."""
         return self.autoencoder.get_input_embeddings()

 from typing import Optional, Tuple, Union, Dict, Any, List
 from dataclasses import dataclass
 import random
+import re
 from transformers import PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutput
 except Exception:
     from configuration_autoencoder import AutoencoderConfig  # local usage
+# Block-based architecture components
+try:
+    from .blocks import (
+        BlockFactory,
+        BlockSequence,
+        LinearBlockConfig,
+        AttentionBlockConfig,
+        RecurrentBlockConfig,
+        ConvolutionalBlockConfig,
+        VariationalBlockConfig,
+        VariationalBlock,
+    )  # when in package
+except Exception:
+    from blocks import (
+        BlockFactory,
+        BlockSequence,
+        LinearBlockConfig,
+        AttentionBlockConfig,
+        RecurrentBlockConfig,
+        ConvolutionalBlockConfig,
+        VariationalBlockConfig,
+        VariationalBlock,
+    )  # local usage
+# Shared utilities
+try:
+    from .utils import _get_activation
+except Exception:
+    from utils import _get_activation
+# Preprocessing components
+try:
+    from .preprocessing import PreprocessingBlock  # when in package
+except Exception:
+    from preprocessing import PreprocessingBlock  # local usage
 @dataclass
             # Standard encoder output
             self.fc_out = nn.Linear(input_dim, config.latent_dim)
     def forward(self, x: torch.Tensor) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         """Forward pass through encoder."""
                 if config.use_batch_norm:
                     layers.append(nn.BatchNorm1d(hidden_dim))
+                layers.append(_get_activation(config.activation))
                 if config.dropout_rate > 0:
                     layers.append(nn.Dropout(config.dropout_rate))
         self.decoder = nn.Sequential(*layers)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Forward pass through decoder."""
         super().__init__(config)
         self.config = config
+        # Initialize learnable preprocessing as a single forward block only
         if config.has_preprocessing:
+            self.pre_block = PreprocessingBlock(config, inverse=False)
+        else:
+            self.pre_block = None
+        # Build block-based encoder/decoder sequences (breaking change refactor)
+        norm = "batch" if config.use_batch_norm else "none"
+        def default_linear_sequence(in_dim: int, dims: List[int], activation: str, normalization: str, dropout: float) -> List[LinearBlockConfig]:
+            cfgs: List[LinearBlockConfig] = []
+            prev = in_dim
+            for h in dims:
+                cfgs.append(
+                    LinearBlockConfig(
+                        input_dim=prev,
+                        output_dim=h,
+                        activation=activation,
+                        normalization=normalization,
+                        dropout_rate=dropout,
+                        use_residual=False,
+                    )
+                )
+                prev = h
+            return cfgs
+        # Encoder: use explicit block list if provided, else hidden_dims default
+        if getattr(config, "encoder_blocks", None):
+            enc_cfgs = config.encoder_blocks
+            # Compute enc_out_dim from last block's output_dim if linear/conv, else assume input_dim
+            last_out = None
+            for b in enc_cfgs:
+                if isinstance(b, dict):
+                    last_out = b.get("output_dim", last_out)
+                else:
+                    last_out = getattr(b, "output_dim", last_out)
+            enc_out_dim = last_out or (config.hidden_dims[-1] if config.hidden_dims else config.input_dim)
         else:
+            enc_cfgs = default_linear_sequence(config.input_dim, config.hidden_dims, config.activation, norm, config.dropout_rate)
+            enc_out_dim = config.hidden_dims[-1] if config.hidden_dims else config.input_dim
+        base_encoder_seq: BlockSequence = BlockFactory.build_sequence(enc_cfgs) if len(enc_cfgs) > 0 else BlockSequence([])
+        # Do not inject pre_block into encoder sequence; apply it explicitly in forward
+        self.encoder_seq = base_encoder_seq
+        # Project to latent
+        if config.is_variational:
+            self.fc_mu = nn.Linear(enc_out_dim, config.latent_dim)
+            self.fc_logvar = nn.Linear(enc_out_dim, config.latent_dim)
+            self.to_latent = None
         else:
+            self.fc_mu = None
+            self.fc_logvar = None
+            self.to_latent = nn.Linear(enc_out_dim, config.latent_dim)
+        # Decoder: use explicit block list if provided, else default MLP back to input
+        if getattr(config, "decoder_blocks", None):
+            dec_cfgs = config.decoder_blocks
+        else:
+            dec_dims = config.decoder_dims + [config.input_dim]
+            dec_cfgs = default_linear_sequence(config.latent_dim, dec_dims, config.activation, norm, config.dropout_rate)
+            # For final projection to input_dim: identity activation and no norm/dropout
+            if len(dec_cfgs) > 0:
+                last = dec_cfgs[-1]
+                last.activation = "identity"
+                last.normalization = "none"
+                last.dropout_rate = 0.0
+        self.decoder_seq: BlockSequence = BlockFactory.build_sequence(dec_cfgs) if len(dec_cfgs) > 0 else BlockSequence([])
+        # Tie weights if specified (no-op for now)
         if config.tie_weights:
             self._tie_weights()
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # Apply learnable preprocessing via block (forward only)
+        if self.pre_block is not None:
+            input_values = self.pre_block(input_values)
         preprocessing_loss = torch.tensor(0.0, device=input_values.device)
+        # Block-based forward
+        # Encode through block sequence
+        enc_out = self.encoder_seq(input_values)
+        # Sample or project to latent
+        if self.config.is_variational:
+            # Use VariationalBlock to encapsulate VAE behavior
+            self._variational = getattr(self, '_variational', None)
+            if self._variational is None:
+                self._variational = VariationalBlock(VariationalBlockConfig(input_dim=enc_out.shape[-1], latent_dim=self.config.latent_dim)).to(enc_out.device)
+            latent = self._variational(enc_out, training=self.training)
+            self._mu = self._variational._mu
+            self._logvar = self._variational._logvar
         else:
+            latent = self.to_latent(enc_out) if self.to_latent is not None else enc_out
+            self._mu, self._logvar = None, None
+        # Decode back to input space
+        reconstructed = self.decoder_seq(latent)
         hidden_states = None
         if output_hidden_states:
             if self.config.is_variational:
+                hidden_states = (latent, getattr(self, '_mu', None), getattr(self, '_logvar', None))
             else:
                 hidden_states = (latent,)
         # Initialize weights
         self.post_init()
     def get_input_embeddings(self):
         """Return the input embeddings by delegating to the wrapped ``self.autoencoder``."""
         return self.autoencoder.get_input_embeddings()

preprocessing.py ADDED Viewed

	@@ -0,0 +1,457 @@

+# flake8: noqa
+"""
+Learnable preprocessing components for the block-based autoencoder.
+Extracted from modeling_autoencoder.py to a dedicated module.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Optional, Tuple
+import torch
+from typing import Tuple
+try:
+    from .blocks import BaseBlock
+except Exception:
+    from blocks import BaseBlock
+import torch.nn as nn
+try:
+    from .configuration_autoencoder import AutoencoderConfig  # when loaded via HF dynamic module
+except Exception:
+    from configuration_autoencoder import AutoencoderConfig  # local usage
+class NeuralScaler(nn.Module):
+    """Learnable alternative to StandardScaler using neural networks.
+
+    Training mode: the per-feature batch mean/std are adjusted by two small MLPs
+    (``mean_estimator`` / ``std_estimator``) and running averages of the raw
+    batch statistics are updated. Eval mode: the running statistics are used
+    directly. A learnable per-feature affine (``weight`` / ``bias``) is applied
+    after normalization.
+    """
+    def __init__(self, config: AutoencoderConfig):
+        """Create the estimators, affine parameters and running-statistic buffers.
+
+        Reads ``config.input_dim`` and ``config.preprocessing_hidden_dim``.
+        """
+        super().__init__()
+        self.config = config
+        input_dim = config.input_dim
+        hidden_dim = config.preprocessing_hidden_dim
+        self.mean_estimator = nn.Sequential(
+            nn.Linear(input_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, input_dim)
+        )
+        # Softplus keeps the learned std adjustment strictly positive.
+        self.std_estimator = nn.Sequential(
+            nn.Linear(input_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, input_dim), nn.Softplus()
+        )
+        self.weight = nn.Parameter(torch.ones(input_dim))
+        self.bias = nn.Parameter(torch.zeros(input_dim))
+        self.register_buffer("running_mean", torch.zeros(input_dim))
+        self.register_buffer("running_std", torch.ones(input_dim))
+        self.register_buffer("num_batches_tracked", torch.tensor(0, dtype=torch.long))
+        self.momentum = 0.1
+    def forward(self, x: torch.Tensor, inverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Normalize ``x``, or undo the normalization when ``inverse`` is True.
+
+        Args:
+            x: Tensor whose last dimension is ``input_dim``. A 3-D input is
+                flattened to 2-D for the computation and restored afterwards.
+            inverse: If True, dispatch to ``_inverse_transform``.
+
+        Returns:
+            ``(transformed, reg_loss)`` where ``reg_loss`` penalizes the variance
+            of the affine ``weight`` and ``bias``.
+        """
+        if inverse:
+            return self._inverse_transform(x)
+        original_shape = x.shape
+        if x.dim() == 3:
+            # reshape (not view) so non-contiguous inputs are accepted as well
+            x = x.reshape(-1, x.size(-1))
+        if self.training:
+            batch_mean = x.mean(dim=0, keepdim=True)
+            if x.size(0) > 1:
+                batch_std = x.std(dim=0, keepdim=True)
+            else:
+                # The sample std of a single row is NaN and would poison
+                # running_std for good; reuse the current running estimate.
+                # clone() so the in-place update below does not alias it.
+                batch_std = self.running_std.unsqueeze(0).clone()
+            learned_mean_adj = self.mean_estimator(batch_mean)
+            learned_std_adj = self.std_estimator(batch_std)
+            effective_mean = batch_mean + learned_mean_adj
+            effective_std = batch_std + learned_std_adj + 1e-8
+            with torch.no_grad():
+                self.num_batches_tracked += 1
+                if self.num_batches_tracked == 1:
+                    # First batch seeds the running statistics directly.
+                    self.running_mean.copy_(batch_mean.squeeze())
+                    self.running_std.copy_(batch_std.squeeze())
+                else:
+                    self.running_mean.mul_(1 - self.momentum).add_(batch_mean.squeeze(), alpha=self.momentum)
+                    self.running_std.mul_(1 - self.momentum).add_(batch_std.squeeze(), alpha=self.momentum)
+        else:
+            effective_mean = self.running_mean.unsqueeze(0)
+            effective_std = self.running_std.unsqueeze(0) + 1e-8
+        normalized = (x - effective_mean) / effective_std
+        transformed = normalized * self.weight + self.bias
+        if len(original_shape) == 3:
+            transformed = transformed.reshape(original_shape)
+        reg_loss = 0.01 * (self.weight.var() + self.bias.var())
+        return transformed, reg_loss
+    def _inverse_transform(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Undo the affine and the running-statistics normalization.
+
+        Returns ``x`` unchanged when ``config.learn_inverse_preprocessing`` is
+        falsy. The second element of the result is always a zero loss.
+        """
+        if not self.config.learn_inverse_preprocessing:
+            return x, torch.tensor(0.0, device=x.device)
+        original_shape = x.shape
+        if x.dim() == 3:
+            x = x.reshape(-1, x.size(-1))
+        x = (x - self.bias) / (self.weight + 1e-8)
+        effective_mean = self.running_mean.unsqueeze(0)
+        effective_std = self.running_std.unsqueeze(0) + 1e-8
+        x = x * effective_std + effective_mean
+        if len(original_shape) == 3:
+            x = x.reshape(original_shape)
+        return x, torch.tensor(0.0, device=x.device)
+class LearnableMinMaxScaler(nn.Module):
+    """Min-max scaler whose bounds are refined by small MLPs during training.
+
+    While training, the per-feature batch minimum and range are corrected by
+    ``min_estimator`` / ``range_estimator`` and running averages of the raw
+    batch values are kept; in eval mode the running values are used. A
+    per-feature affine (``weight`` / ``bias``) is applied to the scaled data.
+    """
+    def __init__(self, config: AutoencoderConfig):
+        """Set up estimators, affine parameters and running buffers from ``config``."""
+        super().__init__()
+        self.config = config
+        n_features = config.input_dim
+        width = config.preprocessing_hidden_dim
+        def make_mlp(*tail: nn.Module) -> nn.Sequential:
+            # Three-layer MLP feature -> hidden -> hidden -> feature, plus optional tail modules.
+            return nn.Sequential(
+                nn.Linear(n_features, width), nn.ReLU(), nn.Linear(width, width), nn.ReLU(), nn.Linear(width, n_features), *tail
+            )
+        self.min_estimator = make_mlp()
+        self.range_estimator = make_mlp(nn.Softplus())
+        self.weight = nn.Parameter(torch.ones(n_features))
+        self.bias = nn.Parameter(torch.zeros(n_features))
+        self.register_buffer("running_min", torch.zeros(n_features))
+        self.register_buffer("running_range", torch.ones(n_features))
+        self.register_buffer("num_batches_tracked", torch.tensor(0, dtype=torch.long))
+        self.momentum = 0.1
+    def forward(self, x: torch.Tensor, inverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Scale ``x`` (or invert the scaling) and return ``(tensor, reg_loss)``."""
+        if inverse:
+            return self._inverse_transform(x)
+        in_shape = x.shape
+        flat = x.view(-1, x.size(-1)) if x.dim() == 3 else x
+        tiny = 1e-8
+        if not self.training:
+            effective_min = self.running_min.unsqueeze(0)
+            effective_range = self.running_range.unsqueeze(0)
+        else:
+            lo = flat.min(dim=0, keepdim=True).values
+            hi = flat.max(dim=0, keepdim=True).values
+            span = (hi - lo).clamp_min(tiny)
+            min_shift = self.min_estimator(lo)
+            span_shift = self.range_estimator(span)
+            effective_min = lo + min_shift
+            effective_range = span + span_shift + tiny
+            with torch.no_grad():
+                self.num_batches_tracked += 1
+                seed_stats = self.num_batches_tracked == 1
+                for buf, stat in ((self.running_min, lo), (self.running_range, span)):
+                    if seed_stats:
+                        # The very first batch initializes the running value.
+                        buf.copy_(stat.squeeze())
+                    else:
+                        buf.mul_(1 - self.momentum).add_(stat.squeeze(), alpha=self.momentum)
+        transformed = ((flat - effective_min) / effective_range) * self.weight + self.bias
+        if len(in_shape) == 3:
+            transformed = transformed.view(in_shape)
+        reg_loss = 0.01 * (self.weight.var() + self.bias.var())
+        if self.training:
+            # Extra penalty that grows as the effective range shrinks.
+            reg_loss = reg_loss + 0.001 * (1.0 / effective_range.clamp_min(1e-3)).mean()
+        return transformed, reg_loss
+    def _inverse_transform(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Undo the affine and the running min/range scaling; loss is always zero."""
+        zero = torch.tensor(0.0, device=x.device)
+        if not self.config.learn_inverse_preprocessing:
+            return x, zero
+        in_shape = x.shape
+        flat = x.view(-1, x.size(-1)) if x.dim() == 3 else x
+        unscaled = (flat - self.bias) / (self.weight + 1e-8)
+        restored = unscaled * self.running_range.unsqueeze(0) + self.running_min.unsqueeze(0)
+        if len(in_shape) == 3:
+            restored = restored.view(in_shape)
+        return restored, zero
+class LearnableRobustScaler(nn.Module):
+    """Learnable Robust scaler using median and IQR with learnable adjustments.
+
+    Training mode: the per-feature batch median and inter-quartile range are
+    adjusted by ``median_estimator`` / ``iqr_estimator`` and running averages
+    of the raw batch values are updated. Eval mode: the running values are
+    used. A learnable per-feature affine (``weight`` / ``bias``) follows.
+    """
+    def __init__(self, config: AutoencoderConfig):
+        """Create the estimators, affine parameters and running buffers.
+
+        Reads ``config.input_dim`` and ``config.preprocessing_hidden_dim``.
+        """
+        super().__init__()
+        self.config = config
+        input_dim = config.input_dim
+        hidden_dim = config.preprocessing_hidden_dim
+        self.median_estimator = nn.Sequential(
+            nn.Linear(input_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, input_dim)
+        )
+        # Softplus keeps the learned IQR adjustment strictly positive.
+        self.iqr_estimator = nn.Sequential(
+            nn.Linear(input_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, input_dim), nn.Softplus()
+        )
+        self.weight = nn.Parameter(torch.ones(input_dim))
+        self.bias = nn.Parameter(torch.zeros(input_dim))
+        self.register_buffer("running_median", torch.zeros(input_dim))
+        self.register_buffer("running_iqr", torch.ones(input_dim))
+        self.register_buffer("num_batches_tracked", torch.tensor(0, dtype=torch.long))
+        self.momentum = 0.1
+    def forward(self, x: torch.Tensor, inverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Scale ``x`` by median/IQR, or undo the scaling when ``inverse`` is True.
+
+        Args:
+            x: Tensor whose last dimension is ``input_dim``. A 3-D input is
+                flattened to 2-D for the computation and restored afterwards.
+            inverse: If True, dispatch to ``_inverse_transform``.
+
+        Returns:
+            ``(transformed, reg_loss)``.
+        """
+        if inverse:
+            return self._inverse_transform(x)
+        original_shape = x.shape
+        if x.dim() == 3:
+            # reshape (not view) so non-contiguous inputs are accepted as well
+            x = x.reshape(-1, x.size(-1))
+        eps = 1e-8
+        if self.training:
+            # torch.quantile requires q to have the same dtype as the input,
+            # so build the probabilities in x's dtype (e.g. float64 models).
+            probs = torch.tensor([0.25, 0.5, 0.75], device=x.device, dtype=x.dtype)
+            qs = torch.quantile(x, probs, dim=0)
+            q25, med, q75 = qs[0:1, :], qs[1:2, :], qs[2:3, :]
+            iqr = (q75 - q25).clamp_min(eps)
+            learned_med_adj = self.median_estimator(med)
+            learned_iqr_adj = self.iqr_estimator(iqr)
+            effective_median = med + learned_med_adj
+            effective_iqr = iqr + learned_iqr_adj + eps
+            with torch.no_grad():
+                self.num_batches_tracked += 1
+                if self.num_batches_tracked == 1:
+                    # First batch seeds the running statistics directly.
+                    self.running_median.copy_(med.squeeze())
+                    self.running_iqr.copy_(iqr.squeeze())
+                else:
+                    self.running_median.mul_(1 - self.momentum).add_(med.squeeze(), alpha=self.momentum)
+                    self.running_iqr.mul_(1 - self.momentum).add_(iqr.squeeze(), alpha=self.momentum)
+        else:
+            effective_median = self.running_median.unsqueeze(0)
+            effective_iqr = self.running_iqr.unsqueeze(0)
+        normalized = (x - effective_median) / effective_iqr
+        transformed = normalized * self.weight + self.bias
+        if len(original_shape) == 3:
+            transformed = transformed.reshape(original_shape)
+        reg_loss = 0.01 * (self.weight.var() + self.bias.var())
+        if self.training:
+            # Extra penalty that grows as the effective IQR shrinks.
+            reg_loss = reg_loss + 0.001 * (1.0 / effective_iqr.clamp_min(1e-3)).mean()
+        return transformed, reg_loss
+    def _inverse_transform(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Undo the affine and the running median/IQR scaling.
+
+        Returns ``x`` unchanged when ``config.learn_inverse_preprocessing`` is
+        falsy. The second element of the result is always a zero loss.
+        """
+        if not self.config.learn_inverse_preprocessing:
+            return x, torch.tensor(0.0, device=x.device)
+        original_shape = x.shape
+        if x.dim() == 3:
+            x = x.reshape(-1, x.size(-1))
+        x = (x - self.bias) / (self.weight + 1e-8)
+        x = x * self.running_iqr.unsqueeze(0) + self.running_median.unsqueeze(0)
+        if len(original_shape) == 3:
+            x = x.reshape(original_shape)
+        return x, torch.tensor(0.0, device=x.device)
+class LearnableYeoJohnsonPreprocessor(nn.Module):
+    """Learnable Yeo-Johnson power transform with per-feature lambda and affine head.
+
+    The input is passed through a Yeo-Johnson transform with a learnable
+    per-feature ``lmbda``, standardized with batch statistics (training) or
+    running statistics (eval), then mapped through a per-feature affine
+    (``weight`` / ``bias``).
+    """
+    def __init__(self, config: AutoencoderConfig):
+        """Create lambda, affine parameters and running buffers sized by ``config.input_dim``."""
+        super().__init__()
+        self.config = config
+        input_dim = config.input_dim
+        # lambda == 1 makes the Yeo-Johnson transform the identity.
+        self.lmbda = nn.Parameter(torch.ones(input_dim))
+        self.weight = nn.Parameter(torch.ones(input_dim))
+        self.bias = nn.Parameter(torch.zeros(input_dim))
+        self.register_buffer("running_mean", torch.zeros(input_dim))
+        self.register_buffer("running_std", torch.ones(input_dim))
+        self.register_buffer("num_batches_tracked", torch.tensor(0, dtype=torch.long))
+        self.momentum = 0.1
+    def _yeo_johnson(self, x: torch.Tensor, lmbda: torch.Tensor) -> torch.Tensor:
+        """Apply the Yeo-Johnson transform element-wise with per-feature ``lmbda``."""
+        eps = 1e-6
+        lmbda = lmbda.unsqueeze(0)
+        pos = x >= 0
+        # Both branches are evaluated for every element; the clamps keep the
+        # unselected branch away from non-positive bases.
+        if_part = torch.where(torch.abs(lmbda) > eps, ((x + 1.0).clamp_min(eps) ** lmbda - 1.0) / lmbda, torch.log((x + 1.0).clamp_min(eps)))
+        two_minus_lambda = 2.0 - lmbda
+        else_part = torch.where(torch.abs(two_minus_lambda) > eps, -(((1.0 - x).clamp_min(eps)) ** two_minus_lambda - 1.0) / two_minus_lambda, -torch.log((1.0 - x).clamp_min(eps)))
+        return torch.where(pos, if_part, else_part)
+    def _yeo_johnson_inverse(self, y: torch.Tensor, lmbda: torch.Tensor) -> torch.Tensor:
+        """Invert ``_yeo_johnson`` element-wise for the same per-feature ``lmbda``."""
+        eps = 1e-6
+        lmbda = lmbda.unsqueeze(0)
+        pos = y >= 0
+        x_pos = torch.where(torch.abs(lmbda) > eps, (y * lmbda + 1.0).clamp_min(eps) ** (1.0 / lmbda) - 1.0, torch.exp(y) - 1.0)
+        two_minus_lambda = 2.0 - lmbda
+        x_neg = torch.where(torch.abs(two_minus_lambda) > eps, 1.0 - (1.0 - y * two_minus_lambda).clamp_min(eps) ** (1.0 / two_minus_lambda), 1.0 - torch.exp(-y))
+        return torch.where(pos, x_pos, x_neg)
+    def forward(self, x: torch.Tensor, inverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Transform ``x``, or undo the transform when ``inverse`` is True.
+
+        Args:
+            x: Tensor whose last dimension is ``input_dim``. A 3-D input is
+                flattened to 2-D for the computation and restored afterwards.
+            inverse: If True, dispatch to ``_inverse_transform``.
+
+        Returns:
+            ``(out, reg)`` where ``reg`` pulls ``lmbda`` toward 1 and penalizes
+            the variance of the affine ``weight`` and ``bias``.
+        """
+        if inverse:
+            return self._inverse_transform(x)
+        orig_shape = x.shape
+        if x.dim() == 3:
+            # reshape (not view) so non-contiguous inputs are accepted as well
+            x = x.reshape(-1, x.size(-1))
+        y = self._yeo_johnson(x, self.lmbda)
+        if self.training:
+            batch_mean = y.mean(dim=0, keepdim=True)
+            if y.size(0) > 1:
+                batch_std = y.std(dim=0, keepdim=True).clamp_min(1e-6)
+            else:
+                # The sample std of a single row is NaN (clamp_min keeps NaN)
+                # and would poison running_std; reuse the running estimate.
+                # clone() so the in-place update below does not alias it.
+                batch_std = self.running_std.unsqueeze(0).clone().clamp_min(1e-6)
+            with torch.no_grad():
+                self.num_batches_tracked += 1
+                if self.num_batches_tracked == 1:
+                    # First batch seeds the running statistics directly.
+                    self.running_mean.copy_(batch_mean.squeeze())
+                    self.running_std.copy_(batch_std.squeeze())
+                else:
+                    self.running_mean.mul_(1 - self.momentum).add_(batch_mean.squeeze(), alpha=self.momentum)
+                    self.running_std.mul_(1 - self.momentum).add_(batch_std.squeeze(), alpha=self.momentum)
+            mean = batch_mean
+            std = batch_std
+        else:
+            mean = self.running_mean.unsqueeze(0)
+            std = self.running_std.unsqueeze(0)
+        y_norm = (y - mean) / std
+        out = y_norm * self.weight + self.bias
+        if len(orig_shape) == 3:
+            out = out.reshape(orig_shape)
+        reg = 0.001 * (self.lmbda - 1.0).pow(2).mean() + 0.01 * (self.weight.var() + self.bias.var())
+        return out, reg
+    def _inverse_transform(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Undo the affine, the running-statistics standardization and the power transform.
+
+        Returns ``x`` unchanged when ``config.learn_inverse_preprocessing`` is
+        falsy. The second element of the result is always a zero loss.
+        """
+        if not self.config.learn_inverse_preprocessing:
+            return x, torch.tensor(0.0, device=x.device)
+        orig_shape = x.shape
+        if x.dim() == 3:
+            x = x.reshape(-1, x.size(-1))
+        y = (x - self.bias) / (self.weight + 1e-8)
+        y = y * self.running_std.unsqueeze(0) + self.running_mean.unsqueeze(0)
+        out = self._yeo_johnson_inverse(y, self.lmbda)
+        if len(orig_shape) == 3:
+            out = out.reshape(orig_shape)
+        return out, torch.tensor(0.0, device=x.device)
+class PreprocessingBlock(BaseBlock):
+    """Adapter exposing a LearnablePreprocessor through the BaseBlock interface.
+
+    ``forward`` hands back only the transformed tensor; the regularization
+    loss produced by the wrapped preprocessor is kept on ``.reg_loss``.
+    The direction (forward vs. inverse) is fixed at construction time so that
+    no extra kwargs need to be passed through to other blocks.
+    """
+    def __init__(self, config: AutoencoderConfig, inverse: bool = False, proc: Optional[LearnablePreprocessor] = None):
+        """Wrap ``proc``, or a freshly built ``LearnablePreprocessor(config)`` when none is given."""
+        super().__init__()
+        if proc is None:
+            proc = LearnablePreprocessor(config)
+        self.proc = proc
+        self._output_dim = config.input_dim
+        self.inverse = inverse
+        # Placeholder until the first forward call stores the real loss.
+        self.reg_loss: torch.Tensor = torch.tensor(0.0)
+    @property
+    def output_dim(self) -> int:
+        """Feature dimension of the output (``config.input_dim``)."""
+        return self._output_dim
+    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
+        """Run the wrapped preprocessor, remember its loss and return the tensor."""
+        transformed, penalty = self.proc(x, inverse=self.inverse)
+        self.reg_loss = penalty
+        return transformed
+class CouplingLayer(nn.Module):
+    """Affine coupling layer used as a building block for normalizing flows.
+
+    Features selected by ``mask`` pass through unchanged and condition a
+    scale and a translation that are applied to the remaining features.
+    """
+    def __init__(self, input_dim: int, hidden_dim: int = 64, mask_type: str = "alternating"):
+        """Build the mask and the scale/translation networks.
+
+        Raises:
+            ValueError: if ``mask_type`` is neither "alternating" nor "half".
+        """
+        super().__init__()
+        self.input_dim = input_dim
+        self.hidden_dim = hidden_dim
+        if mask_type == "alternating":
+            pattern = torch.arange(input_dim) % 2
+        elif mask_type == "half":
+            pattern = torch.zeros(input_dim)
+            pattern[: input_dim // 2] = 1
+        else:
+            raise ValueError(f"Unknown mask type: {mask_type}")
+        self.register_buffer("mask", pattern)
+        n_cond = int(self.mask.sum().item())
+        n_free = input_dim - n_cond
+        def trunk() -> list:
+            # Shared layout of both conditioner networks (without the optional output squashing).
+            return [nn.Linear(n_cond, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, n_free)]
+        # Tanh bounds the log-scale to (-1, 1).
+        self.scale_net = nn.Sequential(*trunk(), nn.Tanh())
+        self.translate_net = nn.Sequential(*trunk())
+    def forward(self, x: torch.Tensor, inverse: bool = False):
+        """Apply the coupling transform (or its inverse) and return ``(y, log_det)``."""
+        keep = self.mask.bool()
+        cond = x[:, keep]
+        free = x[:, ~keep]
+        log_scale = self.scale_net(cond)
+        shift = self.translate_net(cond)
+        if inverse:
+            moved = (free - shift) * torch.exp(-log_scale)
+            log_det = -log_scale.sum(dim=1)
+        else:
+            moved = free * torch.exp(log_scale) + shift
+            log_det = log_scale.sum(dim=1)
+        out = torch.zeros_like(x)
+        out[:, keep] = cond
+        out[:, ~keep] = moved
+        return out, log_det
+class NormalizingFlowPreprocessor(nn.Module):
+    """Normalizing flow for learnable data preprocessing.
+
+    Stacks ``config.flow_coupling_layers`` coupling layers whose mask type
+    alternates between "alternating" and "half". When ``config.use_batch_norm``
+    is set, a ``BatchNorm1d`` is applied between consecutive coupling layers.
+    """
+    def __init__(self, config: AutoencoderConfig):
+        """Build the coupling layers and the optional in-between batch norms."""
+        super().__init__()
+        self.config = config
+        input_dim = config.input_dim
+        hidden_dim = config.preprocessing_hidden_dim
+        num_layers = config.flow_coupling_layers
+        self.layers = nn.ModuleList()
+        for i in range(num_layers):
+            mask_type = "alternating" if i % 2 == 0 else "half"
+            self.layers.append(CouplingLayer(input_dim, hidden_dim, mask_type))
+        if config.use_batch_norm:
+            # One batch norm between each pair of consecutive coupling layers.
+            self.batch_norms = nn.ModuleList([nn.BatchNorm1d(input_dim) for _ in range(num_layers - 1)])
+        else:
+            self.batch_norms = None
+    @staticmethod
+    def _invert_batch_norm(bn: nn.BatchNorm1d, y: torch.Tensor) -> torch.Tensor:
+        """Undo ``bn`` using its affine parameters and running statistics."""
+        y = (y - bn.bias) / bn.weight
+        return y * torch.sqrt(bn.running_var + bn.eps) + bn.running_mean
+    def forward(self, x: torch.Tensor, inverse: bool = False):
+        """Run the flow forward, or backward when ``inverse`` is True.
+
+        Args:
+            x: Tensor whose last dimension is ``input_dim``. A 3-D input is
+                flattened to 2-D for the computation and restored afterwards.
+            inverse: If True, the layers are inverted in reverse order and each
+                batch norm is undone with its running statistics. This is the
+                exact inverse of the eval-mode forward pass; in training mode
+                the forward pass normalizes with batch statistics, so the
+                inverse is only approximate there.
+
+        Returns:
+            ``(x, reg_loss)`` where ``reg_loss`` is 0.01 times the mean absolute
+            accumulated log-determinant of the coupling layers.
+        """
+        original_shape = x.shape
+        if x.dim() == 3:
+            # reshape (not view) so non-contiguous inputs are accepted as well
+            x = x.reshape(-1, x.size(-1))
+        # Match x's dtype so the in-place accumulation also works for float64.
+        log_det_total = torch.zeros(x.size(0), device=x.device, dtype=x.dtype)
+        num_layers = len(self.layers)
+        use_bn = self.batch_norms is not None
+        if not inverse:
+            for i, layer in enumerate(self.layers):
+                x, log_det = layer(x, inverse=False)
+                log_det_total += log_det
+                if use_bn and i < num_layers - 1:
+                    x = self.batch_norms[i](x)
+        else:
+            for i in range(num_layers - 1, -1, -1):
+                if use_bn and i < num_layers - 1:
+                    # Forward applied batch_norms[i] after layer i, so it has
+                    # to be undone (not re-applied) before inverting layer i.
+                    x = self._invert_batch_norm(self.batch_norms[i], x)
+                x, log_det = self.layers[i](x, inverse=True)
+                log_det_total += log_det
+        if len(original_shape) == 3:
+            x = x.reshape(original_shape)
+        reg_loss = 0.01 * log_det_total.abs().mean()
+        return x, reg_loss
+class LearnablePreprocessor(nn.Module):
+    """Single entry point that dispatches to the configured preprocessing module."""
+    def __init__(self, config: AutoencoderConfig):
+        """Instantiate the preprocessing backend selected by ``config``.
+
+        Raises:
+            ValueError: if preprocessing is enabled but no known type matches.
+        """
+        super().__init__()
+        self.config = config
+        self.preprocessor = self._select_backend(config)
+    @staticmethod
+    def _select_backend(config: AutoencoderConfig) -> nn.Module:
+        # Checks run in a fixed order; the first matching flag wins.
+        if not config.has_preprocessing:
+            return nn.Identity()
+        if config.is_neural_scaler:
+            return NeuralScaler(config)
+        if config.is_normalizing_flow:
+            return NormalizingFlowPreprocessor(config)
+        if getattr(config, "is_minmax_scaler", False):
+            return LearnableMinMaxScaler(config)
+        if getattr(config, "is_robust_scaler", False):
+            return LearnableRobustScaler(config)
+        if getattr(config, "is_yeo_johnson", False):
+            return LearnableYeoJohnsonPreprocessor(config)
+        raise ValueError(f"Unknown preprocessing type: {config.preprocessing_type}")
+    def forward(self, x: torch.Tensor, inverse: bool = False):
+        """Return ``(tensor, loss)``; a pass-through with zero loss when preprocessing is disabled."""
+        backend = self.preprocessor
+        if not isinstance(backend, nn.Identity):
+            return backend(x, inverse=inverse)
+        return x, torch.tensor(0.0, device=x.device)
+# Public names exported by this module (what ``import *`` exposes).
+__all__ = [
+    "NeuralScaler",
+    "LearnableMinMaxScaler",
+    "LearnableRobustScaler",
+    "LearnableYeoJohnsonPreprocessor",
+    "CouplingLayer",
+    "NormalizingFlowPreprocessor",
+    "LearnablePreprocessor",
+    "PreprocessingBlock",
+]

template.py ADDED Viewed

	@@ -0,0 +1,382 @@

+"""
+Ready-to-use configuration templates for the block-based Autoencoder.
+These helpers demonstrate how to assemble encoder_blocks and decoder_blocks
+for a variety of architectures using the new block system. Each class extends
+AutoencoderConfig and can be passed directly to AutoencoderModel.
+Example:
+    from modeling_autoencoder import AutoencoderModel
+    from template import ClassicAutoencoderConfig
+    cfg = ClassicAutoencoderConfig(input_dim=784, latent_dim=64)
+    model = AutoencoderModel(cfg)
+"""
+from __future__ import annotations
+from typing import List
+# Support both package-relative and flat import
+try:
+    from .configuration_autoencoder import (
+        AutoencoderConfig,
+    )
+except Exception:  # pragma: no cover
+    from configuration_autoencoder import (
+        AutoencoderConfig,
+    )
+# ------------------------------- Helpers ------------------------------- #
+def _linear_stack(input_dim: int, dims: List[int], activation: str = "relu", normalization: str = "batch", dropout: float = 0.0):
+    """Return Linear block config dicts chaining input_dim -> dims[0] -> dims[1] -> ...
+    Each dict carries the same activation, normalization and dropout settings
+    and has ``use_residual`` set to False. An empty ``dims`` gives an empty list.
+    """
+    widths = list(dims)
+    # Every block reads the width produced by the previous one; the first reads input_dim.
+    fan_ins = [input_dim, *widths[:-1]]
+    return [
+        {
+            "type": "linear",
+            "input_dim": fan_in,
+            "output_dim": fan_out,
+            "activation": activation,
+            "normalization": normalization,
+            "dropout_rate": dropout,
+            "use_residual": False,
+        }
+        for fan_in, fan_out in zip(fan_ins, widths)
+    ]
+def _default_decoder(latent_dim: int, hidden: List[int], out_dim: int, activation: str = "relu", normalization: str = "batch", dropout: float = 0.0):
+    """Return Linear block configs for latent_dim -> hidden... -> out_dim.
+    The last block is a plain output projection: identity activation, no
+    normalization and zero dropout. All earlier blocks use the given settings.
+    """
+    layers = _linear_stack(latent_dim, hidden + [out_dim], activation, normalization, dropout)
+    if not layers:
+        return layers
+    # Overwrite in place so the key order of the final dict is unchanged.
+    layers[-1].update(activation="identity", normalization="none", dropout_rate=0.0)
+    return layers
+# ---------------------------- Class-based templates ---------------------------- #
+class ClassicAutoencoderConfig(AutoencoderConfig):
+    """Template for a plain fully connected autoencoder built from Linear blocks.
+    The encoder maps input_dim through each width in ``hidden``; the decoder maps
+    latent_dim through the reversed widths back to input_dim.
+    NOTE(review): the encoder stack ends at hidden[-1], not latent_dim; presumably
+    the model adds the projection to the latent space - confirm.
+    Args:
+        input_dim: Feature size of the input and of the reconstruction.
+        latent_dim: Input width of the first decoder block.
+        hidden: Encoder widths in order; the decoder uses them reversed.
+        activation: Activation name for every block except the decoder output.
+        dropout: Dropout rate for every block except the decoder output.
+        use_batch_norm: "batch" normalization when true, "none" otherwise.
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets (for example ``encoder_blocks``) raises TypeError.
+    Example:
+        cfg = ClassicAutoencoderConfig(input_dim=784, latent_dim=64)
+    """
+    def __init__(
+        self,
+        input_dim: int = 784,
+        latent_dim: int = 64,
+        hidden: List[int] = (512, 256, 128),
+        activation: str = "relu",
+        dropout: float = 0.1,
+        use_batch_norm: bool = True,
+        **kwargs,
+    ):
+        widths = list(hidden)
+        if use_batch_norm:
+            normalization = "batch"
+        else:
+            normalization = "none"
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="classic",
+            encoder_blocks=_linear_stack(input_dim, widths, activation, normalization, dropout),
+            decoder_blocks=_default_decoder(latent_dim, widths[::-1], input_dim, activation, normalization, dropout),
+            **kwargs,
+        )
+class VariationalAutoencoderConfig(AutoencoderConfig):
+    """Template for an MLP variational autoencoder (autoencoder_type "variational").
+    Block layout matches the classic template: Linear encoder over ``hidden``,
+    Linear decoder over the reversed widths. ``beta`` is passed to the base config.
+    Args:
+        input_dim: Feature size of the input and of the reconstruction.
+        latent_dim: Input width of the first decoder block.
+        hidden: Encoder widths in order; the decoder uses them reversed.
+        activation: Activation name for every block except the decoder output.
+        dropout: Dropout rate for every block except the decoder output.
+        use_batch_norm: "batch" normalization when true, "none" otherwise.
+        beta: Forwarded to AutoencoderConfig as ``beta``.
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets raises TypeError.
+    Example:
+        cfg = VariationalAutoencoderConfig(input_dim=784, latent_dim=32)
+    """
+    def __init__(
+        self,
+        input_dim: int = 784,
+        latent_dim: int = 32,
+        hidden: List[int] = (512, 256, 128),
+        activation: str = "relu",
+        dropout: float = 0.1,
+        use_batch_norm: bool = True,
+        beta: float = 1.0,
+        **kwargs,
+    ):
+        widths = list(hidden)
+        if use_batch_norm:
+            normalization = "batch"
+        else:
+            normalization = "none"
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="variational",
+            beta=beta,
+            encoder_blocks=_linear_stack(input_dim, widths, activation, normalization, dropout),
+            decoder_blocks=_default_decoder(latent_dim, widths[::-1], input_dim, activation, normalization, dropout),
+            **kwargs,
+        )
+class TransformerAutoencoderConfig(AutoencoderConfig):
+    """Template with an attention encoder and a Linear (MLP) decoder.
+    Encoder: Linear(input_dim -> input_dim), ``num_layers`` attention blocks with
+    ``ffn_dim = ffn_mult * input_dim``, then Linear(input_dim -> input_dim).
+    Decoder: latent_dim -> input_dim -> input_dim.
+    Works with (batch, input_dim) or (batch, time, input_dim).
+    Args:
+        input_dim: Feature size; also the width of every encoder block.
+        latent_dim: Input width of the first decoder block.
+        num_layers: Number of attention blocks.
+        num_heads: Head count written into each attention block.
+        ffn_mult: Multiplier of input_dim giving each attention block's ffn_dim.
+        activation: Activation name for the Linear blocks.
+        dropout: Dropout rate written into every encoder block.
+        use_batch_norm: "batch" normalization when true, "none" otherwise.
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets raises TypeError.
+    Example:
+        cfg = TransformerAutoencoderConfig(input_dim=256, latent_dim=128)
+    """
+    def __init__(
+        self,
+        input_dim: int = 256,
+        latent_dim: int = 128,
+        num_layers: int = 2,
+        num_heads: int = 4,
+        ffn_mult: int = 4,
+        activation: str = "relu",
+        dropout: float = 0.1,
+        use_batch_norm: bool = False,
+        **kwargs,
+    ):
+        normalization = "batch" if use_batch_norm else "none"
+        def width_preserving_linear():
+            # A new dict on every call so the first and last blocks share no state.
+            return {
+                "type": "linear",
+                "input_dim": input_dim,
+                "output_dim": input_dim,
+                "activation": activation,
+                "normalization": normalization,
+                "dropout_rate": dropout,
+            }
+        attention_layers = [
+            {
+                "type": "attention",
+                "input_dim": input_dim,
+                "num_heads": num_heads,
+                "ffn_dim": ffn_mult * input_dim,
+                "dropout_rate": dropout,
+            }
+            for _ in range(num_layers)
+        ]
+        encoder_blocks = [width_preserving_linear(), *attention_layers, width_preserving_linear()]
+        decoder_blocks = _default_decoder(latent_dim, [input_dim], input_dim, activation, normalization, dropout)
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="classic",
+            encoder_blocks=encoder_blocks,
+            decoder_blocks=decoder_blocks,
+            **kwargs,
+        )
+class RecurrentAutoencoderConfig(AutoencoderConfig):
+    """Template with a single recurrent encoder block (LSTM/GRU/RNN) for sequences.
+    Expected input: (batch, time, input_dim). The recurrent block uses latent_dim
+    as both hidden_size and output_dim. The decoder is Linear:
+    latent_dim -> max(latent_dim, input_dim) -> input_dim.
+    Args:
+        input_dim: Feature size per time step.
+        latent_dim: Hidden and output width of the recurrent block.
+        rnn_type: Written into the block as ``rnn_type``.
+        num_layers: Written into the block as ``num_layers``.
+        bidirectional: Written into the block as ``bidirectional``.
+        activation: Activation name for the decoder's hidden block.
+        dropout: Dropout rate for the recurrent block and decoder hidden block.
+        use_batch_norm: "batch" normalization in the decoder when true, else "none".
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets raises TypeError.
+    Example:
+        cfg = RecurrentAutoencoderConfig(input_dim=128, latent_dim=64, rnn_type="lstm")
+    """
+    def __init__(
+        self,
+        input_dim: int = 128,
+        latent_dim: int = 64,
+        rnn_type: str = "lstm",
+        num_layers: int = 2,
+        bidirectional: bool = False,
+        activation: str = "relu",
+        dropout: float = 0.1,
+        use_batch_norm: bool = False,
+        **kwargs,
+    ):
+        normalization = "batch" if use_batch_norm else "none"
+        recurrent_block = dict(
+            type="recurrent",
+            input_dim=input_dim,
+            hidden_size=latent_dim,
+            num_layers=num_layers,
+            rnn_type=rnn_type,
+            bidirectional=bidirectional,
+            dropout_rate=dropout,
+            output_dim=latent_dim,
+        )
+        # The decoder's hidden layer is as wide as the larger of the two ends.
+        decoder_width = max(latent_dim, input_dim)
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="classic",
+            encoder_blocks=[recurrent_block],
+            decoder_blocks=_default_decoder(latent_dim, [decoder_width], input_dim, activation, normalization, dropout),
+            **kwargs,
+        )
+class ConvolutionalAutoencoderConfig(AutoencoderConfig):
+    """Template with a 1D convolutional encoder for sequences and a Linear decoder.
+    Expected input: (batch, time, input_dim). Encoder: one "same"-padded conv1d
+    block per entry of ``conv_channels``, then Linear to latent_dim. Decoder:
+    latent_dim -> last conv width -> input_dim.
+    Args:
+        input_dim: Feature size per time step; input channels of the first conv.
+        latent_dim: Output width of the encoder's final Linear block.
+        conv_channels: Output channels of each conv block, in order.
+        kernel_size: Kernel size written into every conv block.
+        activation: Activation name for every block except the decoder output.
+        dropout: Dropout rate for every block except the decoder output.
+        use_batch_norm: "batch" normalization when true, "none" otherwise.
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets raises TypeError.
+    Example:
+        cfg = ConvolutionalAutoencoderConfig(input_dim=64, conv_channels=(64, 64))
+    """
+    def __init__(
+        self,
+        input_dim: int = 64,
+        latent_dim: int = 64,
+        conv_channels: List[int] = (64, 64),
+        kernel_size: int = 3,
+        activation: str = "relu",
+        dropout: float = 0.0,
+        use_batch_norm: bool = True,
+        **kwargs,
+    ):
+        normalization = "batch" if use_batch_norm else "none"
+        channels = list(conv_channels)
+        # Each conv reads the channels produced by the one before it.
+        fan_ins = [input_dim, *channels[:-1]]
+        encoder_blocks = [
+            {
+                "type": "conv1d",
+                "input_dim": fan_in,
+                "output_dim": fan_out,
+                "kernel_size": kernel_size,
+                "padding": "same",
+                "activation": activation,
+                "normalization": normalization,
+                "dropout_rate": dropout,
+            }
+            for fan_in, fan_out in zip(fan_ins, channels)
+        ]
+        # With no conv blocks the feature width is still input_dim.
+        conv_width = channels[-1] if channels else input_dim
+        encoder_blocks.append({
+            "type": "linear",
+            "input_dim": conv_width,
+            "output_dim": latent_dim,
+            "activation": activation,
+            "normalization": normalization,
+            "dropout_rate": dropout,
+        })
+        decoder_blocks = _default_decoder(latent_dim, [conv_width], input_dim, activation, normalization, dropout)
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="classic",
+            encoder_blocks=encoder_blocks,
+            decoder_blocks=decoder_blocks,
+            **kwargs,
+        )
+class ConvAttentionAutoencoderConfig(AutoencoderConfig):
+    """Template mixing conv1d and attention blocks in the encoder, for sequences.
+    Encoder: one conv1d block (kernel size 3, "same" padding) per entry of
+    ``conv_channels``, one attention block with ``ffn_dim`` four times the conv
+    width, then Linear to latent_dim. Decoder: latent_dim -> conv width -> input_dim.
+    Args:
+        input_dim: Feature size per time step; input channels of the first conv.
+        latent_dim: Output width of the encoder's final Linear block.
+        conv_channels: Output channels of each conv block, in order.
+        num_heads: Head count written into the attention block.
+        activation: Activation name for conv and Linear blocks.
+        dropout: Dropout rate for every block except the decoder output.
+        use_batch_norm: "batch" normalization when true, "none" otherwise.
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets raises TypeError.
+    Example:
+        cfg = ConvAttentionAutoencoderConfig(input_dim=64, latent_dim=64)
+    """
+    def __init__(
+        self,
+        input_dim: int = 64,
+        latent_dim: int = 64,
+        conv_channels: List[int] = (64,),
+        num_heads: int = 4,
+        activation: str = "relu",
+        dropout: float = 0.1,
+        use_batch_norm: bool = True,
+        **kwargs,
+    ):
+        normalization = "batch" if use_batch_norm else "none"
+        channels = list(conv_channels)
+        # Each conv reads the channels produced by the one before it.
+        fan_ins = [input_dim, *channels[:-1]]
+        encoder_blocks = [
+            {
+                "type": "conv1d",
+                "input_dim": fan_in,
+                "output_dim": fan_out,
+                "kernel_size": 3,
+                "padding": "same",
+                "activation": activation,
+                "normalization": normalization,
+                "dropout_rate": dropout,
+            }
+            for fan_in, fan_out in zip(fan_ins, channels)
+        ]
+        # With no conv blocks the feature width is still input_dim.
+        conv_width = channels[-1] if channels else input_dim
+        encoder_blocks.append({
+            "type": "attention",
+            "input_dim": conv_width,
+            "num_heads": num_heads,
+            "ffn_dim": 4 * conv_width,
+            "dropout_rate": dropout,
+        })
+        encoder_blocks.append({
+            "type": "linear",
+            "input_dim": conv_width,
+            "output_dim": latent_dim,
+            "activation": activation,
+            "normalization": normalization,
+            "dropout_rate": dropout,
+        })
+        decoder_blocks = _default_decoder(latent_dim, [conv_width], input_dim, activation, normalization, dropout)
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="classic",
+            encoder_blocks=encoder_blocks,
+            decoder_blocks=decoder_blocks,
+            **kwargs,
+        )
+class LinearRecurrentAutoencoderConfig(AutoencoderConfig):
+    """Template with a Linear down-projection followed by a recurrent encoder block.
+    Encoder: Linear(input_dim -> latent_dim), then a single-layer unidirectional
+    recurrent block of width latent_dim. Decoder: one Linear block
+    latent_dim -> input_dim with identity activation.
+    Args:
+        input_dim: Feature size of the input and of the reconstruction.
+        latent_dim: Width of the projection and of the recurrent block.
+        rnn_type: Written into the recurrent block as ``rnn_type``.
+        activation: Activation name for the encoder's Linear block.
+        dropout: Dropout rate for both encoder blocks.
+        use_batch_norm: "batch" normalization on the Linear block when true, else "none".
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets raises TypeError.
+    Example:
+        cfg = LinearRecurrentAutoencoderConfig(input_dim=256, latent_dim=64, rnn_type="gru")
+    """
+    def __init__(
+        self,
+        input_dim: int = 256,
+        latent_dim: int = 64,
+        rnn_type: str = "gru",
+        activation: str = "relu",
+        dropout: float = 0.1,
+        use_batch_norm: bool = False,
+        **kwargs,
+    ):
+        normalization = "batch" if use_batch_norm else "none"
+        down_projection = {
+            "type": "linear",
+            "input_dim": input_dim,
+            "output_dim": latent_dim,
+            "activation": activation,
+            "normalization": normalization,
+            "dropout_rate": dropout,
+        }
+        recurrent_block = {
+            "type": "recurrent",
+            "input_dim": latent_dim,
+            "hidden_size": latent_dim,
+            "num_layers": 1,
+            "rnn_type": rnn_type,
+            "bidirectional": False,
+            "dropout_rate": dropout,
+            "output_dim": latent_dim,
+        }
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="classic",
+            encoder_blocks=[down_projection, recurrent_block],
+            # No hidden decoder layers: a single output projection.
+            decoder_blocks=_default_decoder(latent_dim, [], input_dim, activation, normalization, dropout),
+            **kwargs,
+        )
+class PreprocessedAutoencoderConfig(AutoencoderConfig):
+    """Classic MLP autoencoder template with learnable preprocessing enabled.
+    Sets ``use_learnable_preprocessing=True`` and forwards ``preprocessing_type``
+    to the base config. Encoder and decoder are Linear stacks over ``hidden``
+    and its reverse, as in the classic template.
+    Args:
+        input_dim: Feature size of the input and of the reconstruction.
+        latent_dim: Input width of the first decoder block.
+        preprocessing_type: Forwarded to AutoencoderConfig as ``preprocessing_type``.
+        hidden: Encoder widths in order; the decoder uses them reversed. Any
+            iterable is accepted; it is materialized once.
+        activation: Activation name for every block except the decoder output.
+        dropout: Dropout rate for every block except the decoder output.
+        use_batch_norm: "batch" normalization when true, "none" otherwise.
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets raises TypeError.
+    Example:
+        cfg = PreprocessedAutoencoderConfig(input_dim=64, preprocessing_type="neural_scaler")
+    """
+    def __init__(self, input_dim: int = 64, latent_dim: int = 32, preprocessing_type: str = "neural_scaler", hidden: List[int] = (128, 64), activation: str = "relu", dropout: float = 0.0, use_batch_norm: bool = True, **kwargs):
+        # Convert once: converting separately for encoder and decoder would hand
+        # the decoder an exhausted iterator (and so no hidden layers) whenever
+        # `hidden` is a one-shot iterable.
+        hidden = list(hidden)
+        norm = "batch" if use_batch_norm else "none"
+        enc = _linear_stack(input_dim, hidden, activation, norm, dropout)
+        dec = _default_decoder(latent_dim, list(reversed(hidden)), input_dim, activation, norm, dropout)
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="classic",
+            use_learnable_preprocessing=True,
+            preprocessing_type=preprocessing_type,
+            encoder_blocks=enc,
+            decoder_blocks=dec,
+            **kwargs,
+        )
+class BetaVariationalAutoencoderConfig(AutoencoderConfig):
+    """Template for an MLP Beta-VAE (autoencoder_type "beta_vae").
+    Same block layout as the variational template; ``beta`` defaults to 4.0 and
+    is passed to the base config, where beta > 1 controls the KL weight.
+    Args:
+        input_dim: Feature size of the input and of the reconstruction.
+        latent_dim: Input width of the first decoder block.
+        hidden: Encoder widths in order; the decoder uses them reversed.
+        activation: Activation name for every block except the decoder output.
+        dropout: Dropout rate for every block except the decoder output.
+        use_batch_norm: "batch" normalization when true, "none" otherwise.
+        beta: Forwarded to AutoencoderConfig as ``beta``.
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets raises TypeError.
+    Example:
+        cfg = BetaVariationalAutoencoderConfig(input_dim=784, latent_dim=32, beta=4.0)
+    """
+    def __init__(
+        self,
+        input_dim: int = 784,
+        latent_dim: int = 32,
+        hidden: List[int] = (512, 256, 128),
+        activation: str = "relu",
+        dropout: float = 0.1,
+        use_batch_norm: bool = True,
+        beta: float = 4.0,
+        **kwargs,
+    ):
+        widths = list(hidden)
+        if use_batch_norm:
+            normalization = "batch"
+        else:
+            normalization = "none"
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="beta_vae",
+            beta=beta,
+            encoder_blocks=_linear_stack(input_dim, widths, activation, normalization, dropout),
+            decoder_blocks=_default_decoder(latent_dim, widths[::-1], input_dim, activation, normalization, dropout),
+            **kwargs,
+        )
+class DenoisingAutoencoderConfig(AutoencoderConfig):
+    """Template for a denoising autoencoder (autoencoder_type "denoising").
+    This class only records ``noise_factor`` in the config; adding the noise is
+    left to the training loop or model, if supported.
+    Args:
+        input_dim: Feature size of the input and of the reconstruction.
+        latent_dim: Input width of the first decoder block.
+        hidden: Encoder widths in order; the decoder uses them reversed.
+        activation: Activation name for every block except the decoder output.
+        dropout: Dropout rate for every block except the decoder output.
+        use_batch_norm: "batch" normalization when true, "none" otherwise.
+        noise_factor: Forwarded to AutoencoderConfig as ``noise_factor``.
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets raises TypeError.
+    Example:
+        cfg = DenoisingAutoencoderConfig(input_dim=128, latent_dim=32, noise_factor=0.2)
+    """
+    def __init__(
+        self,
+        input_dim: int = 128,
+        latent_dim: int = 32,
+        hidden: List[int] = (128, 64),
+        activation: str = "relu",
+        dropout: float = 0.0,
+        use_batch_norm: bool = True,
+        noise_factor: float = 0.2,
+        **kwargs,
+    ):
+        widths = list(hidden)
+        if use_batch_norm:
+            normalization = "batch"
+        else:
+            normalization = "none"
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="denoising",
+            noise_factor=noise_factor,
+            encoder_blocks=_linear_stack(input_dim, widths, activation, normalization, dropout),
+            decoder_blocks=_default_decoder(latent_dim, widths[::-1], input_dim, activation, normalization, dropout),
+            **kwargs,
+        )
+class SparseAutoencoderConfig(AutoencoderConfig):
+    """Template for a sparse autoencoder (autoencoder_type "sparse").
+    Only the type tag and the Linear block layout are set here; the sparsity
+    penalty (typically L1 on activations) is applied in the training loop.
+    Args:
+        input_dim: Feature size of the input and of the reconstruction.
+        latent_dim: Input width of the first decoder block.
+        hidden: Encoder widths in order; the decoder uses them reversed.
+        activation: Activation name for every block except the decoder output.
+        dropout: Dropout rate for every block except the decoder output.
+        use_batch_norm: "batch" normalization when true, "none" otherwise.
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets raises TypeError.
+    Example:
+        cfg = SparseAutoencoderConfig(input_dim=256, latent_dim=64)
+    """
+    def __init__(
+        self,
+        input_dim: int = 256,
+        latent_dim: int = 64,
+        hidden: List[int] = (128, 64),
+        activation: str = "relu",
+        dropout: float = 0.0,
+        use_batch_norm: bool = True,
+        **kwargs,
+    ):
+        widths = list(hidden)
+        if use_batch_norm:
+            normalization = "batch"
+        else:
+            normalization = "none"
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="sparse",
+            encoder_blocks=_linear_stack(input_dim, widths, activation, normalization, dropout),
+            decoder_blocks=_default_decoder(latent_dim, widths[::-1], input_dim, activation, normalization, dropout),
+            **kwargs,
+        )
+class ContractiveAutoencoderConfig(AutoencoderConfig):
+    """Template for a contractive autoencoder (autoencoder_type "contractive").
+    Only the type tag and the Linear block layout are set here; the Jacobian
+    penalty has to be applied in the training loop.
+    Args:
+        input_dim: Feature size of the input and of the reconstruction.
+        latent_dim: Input width of the first decoder block.
+        hidden: Encoder widths in order; the decoder uses them reversed.
+        activation: Activation name for every block except the decoder output.
+        dropout: Dropout rate for every block except the decoder output.
+        use_batch_norm: "batch" normalization when true, "none" otherwise.
+        **kwargs: Forwarded to AutoencoderConfig. Repeating a key this template
+            already sets raises TypeError.
+    Example:
+        cfg = ContractiveAutoencoderConfig(input_dim=64, latent_dim=16)
+    """
+    def __init__(
+        self,
+        input_dim: int = 64,
+        latent_dim: int = 16,
+        hidden: List[int] = (64, 32),
+        activation: str = "relu",
+        dropout: float = 0.0,
+        use_batch_norm: bool = True,
+        **kwargs,
+    ):
+        widths = list(hidden)
+        if use_batch_norm:
+            normalization = "batch"
+        else:
+            normalization = "none"
+        super().__init__(
+            input_dim=input_dim,
+            latent_dim=latent_dim,
+            activation=activation,
+            dropout_rate=dropout,
+            use_batch_norm=use_batch_norm,
+            autoencoder_type="contractive",
+            encoder_blocks=_linear_stack(input_dim, widths, activation, normalization, dropout),
+            decoder_blocks=_default_decoder(latent_dim, widths[::-1], input_dim, activation, normalization, dropout),
+            **kwargs,
+        )
+# Public API of this module: the template config classes only. The private
+# helpers `_linear_stack` and `_default_decoder` are deliberately not exported.
+__all__ = [
+    "ClassicAutoencoderConfig",
+    "VariationalAutoencoderConfig",
+    "TransformerAutoencoderConfig",
+    "RecurrentAutoencoderConfig",
+    "ConvolutionalAutoencoderConfig",
+    "ConvAttentionAutoencoderConfig",
+    "LinearRecurrentAutoencoderConfig",
+    "PreprocessedAutoencoderConfig",
+    "BetaVariationalAutoencoderConfig",
+    "DenoisingAutoencoderConfig",
+    "SparseAutoencoderConfig",
+    "ContractiveAutoencoderConfig",
+]

utils.py ADDED Viewed

	@@ -0,0 +1,69 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# ---------------------------- Utilities ---------------------------- #
+def _get_activation(name: Optional[str]) -> nn.Module:
+    """Return a newly constructed activation module for ``name``.
+    Args:
+        name: Case-insensitive activation name: relu, gelu, silu, swish (alias of
+            silu), tanh, sigmoid, leaky_relu (negative slope 0.2), elu, mish,
+            softplus, identity or none. ``None`` also yields ``nn.Identity``.
+    Returns:
+        A fresh ``nn.Module`` instance on every call.
+    Raises:
+        ValueError: If the name is not one of the supported activations.
+    """
+    if name is None:
+        return nn.Identity()
+    key = name.lower()
+    # Factories rather than instances: only the requested module is built,
+    # instead of constructing every activation on each call.
+    factories = {
+        "relu": nn.ReLU,
+        "gelu": nn.GELU,
+        "silu": nn.SiLU,
+        "swish": nn.SiLU,
+        "tanh": nn.Tanh,
+        "sigmoid": nn.Sigmoid,
+        "leaky_relu": lambda: nn.LeakyReLU(0.2),
+        "elu": nn.ELU,
+        "mish": nn.Mish,
+        "softplus": nn.Softplus,
+        "identity": nn.Identity,
+        # Accepted for parity with _get_norm, which treats "none" as a no-op.
+        "none": nn.Identity,
+    }
+    factory = factories.get(key)
+    if factory is None:
+        raise ValueError(f"Unknown activation: {key}")
+    return factory()
+def _get_norm(name: Optional[str], num_features: int) -> nn.Module:
+    """Return a normalization module for ``name`` sized for ``num_features``.
+    Args:
+        name: Case-insensitive choice: "batch" (BatchNorm1d), "layer" (LayerNorm),
+            "instance" (InstanceNorm1d), "group" (GroupNorm) or "none".
+            ``None`` and "none" yield ``nn.Identity``.
+        num_features: Feature/channel count passed to the module.
+    Returns:
+        The normalization module. For "group", the group count is the largest
+        value not above 8 that divides ``num_features``; when only 1 divides it,
+        ``nn.LayerNorm`` is returned instead of a single-group GroupNorm.
+    Raises:
+        ValueError: If the name is not recognised.
+    """
+    if name is None:
+        return nn.Identity()
+    # Lower-case before any comparison so "None"/"NONE" behave like "none",
+    # the same way "Batch" already behaves like "batch".
+    name = name.lower()
+    if name == "none":
+        return nn.Identity()
+    if name == "batch":
+        return nn.BatchNorm1d(num_features)
+    if name == "layer":
+        return nn.LayerNorm(num_features)
+    if name == "instance":
+        return nn.InstanceNorm1d(num_features)
+    if name == "group":
+        # Search downwards from at most 8 for a group count that divides the
+        # feature count; fall back to 1 when nothing larger does.
+        upper = max(1, min(8, num_features))
+        groups = next((g for g in range(upper, 1, -1) if num_features % g == 0), 1)
+        if groups == 1:
+            return nn.LayerNorm(num_features)
+        return nn.GroupNorm(groups, num_features)
+    raise ValueError(f"Unknown normalization: {name}")
+def _flatten_3d_to_2d(x: torch.Tensor) -> Tuple[torch.Tensor, Optional[Tuple[int, int]]]:
+    """Merge the first two axes of a 3-D tensor; pass any other rank through.
+    Returns:
+        ``(flattened, (d0, d1))`` for a 3-D input of shape (d0, d1, d2), where
+        ``flattened`` has shape (d0 * d1, d2); otherwise ``(x, None)``.
+    """
+    if x.dim() != 3:
+        return x, None
+    lead, mid, feats = x.shape
+    # The leading sizes are returned so the caller can undo the merge later.
+    return x.reshape(lead * mid, feats), (lead, mid)
+def _maybe_restore_3d(x: torch.Tensor, shape_hint: Optional[Tuple[int, int]]) -> torch.Tensor:
+    """Reshape ``x`` to ``(*shape_hint, x.shape[-1])``, or return it unchanged.
+    ``shape_hint`` is a pair of leading sizes; ``None`` means nothing to restore.
+    """
+    if shape_hint is not None:
+        lead, mid = shape_hint
+        # The last-axis size is read from x itself, so it may differ from the
+        # width the tensor had before it was flattened.
+        return x.reshape(lead, mid, x.shape[-1])
+    return x