model_class: NDT1
encoder:
  from_pt: null
  stitching: false
  masker:
    force_active: true
    mode: temporal
    ratio: 0.3 # ratio of data to predict
    zero_ratio: 1.0 # of the data to predict, ratio zeroed out
    random_ratio: 1.0 # of the not zeroed, ratio randomly replaced
    expand_prob: 0.0 # probability of expanding the mask in ``temporal`` mode
    max_timespan: 1 # max span of mask if expanded
    channels: null # neurons to mask in ``co-smoothing`` mode
    timesteps: null # time steps to mask in ``forward-pred`` mode
    mask_regions: ['all'] # brain regions to mask in ``inter-region`` mode
    target_regions: ['all'] # brain regions to predict in ``intra-region`` mode
    n_mask_regions: 1 # number of regions to choose from the list of mask_regions or target_regions
  # context available for each timestep
  context:
    forward: -1
    backward: -1
  norm_and_noise:
    active: false
    smooth_sd: 2 # gaussian smoothing
    norm: "zscore" # which normalization layer to use (null/layernorm/scalenorm/zscore)
    eps: 1.e-7 # avoid dividing by zero when normalizing padded spikes
    white_noise_sd: 1.0 # gaussian noise added to the inputs (1.0 originally)
    constant_offset_sd: 0.2 # gaussian noise added to the inputs but constant in the time dimension (0.2 originally)
  embedder:
    n_channels: 668 # number of neurons recorded
    n_blocks: 24 # number of blocks of experiments
    n_dates: 24 # number of days of experiments
    max_F: 100 # max feature length in timesteps
    mode: linear # linear/embed/identity
    mult: 2 # embedding multiplier. hidden_size = n_channels * mult
    adapt: false # adapt the embedding layer for each day
    pos: true # embed position
    act: softsign # activation for the embedding layers
    scale: 1 # scale the embedding by multiplying by this number
    bias: true # use bias in the embedding layer
    dropout: 0.2 # dropout in embedding layer
    fixup_init: false # modify weight initialization
    init_range: 0.1 # initialization range for embeddings
    spike_log_init: false # special initialization
    max_spikes: 0 # max number of spikes in a single time bin
    tokenize_binary_mask: false
    use_prompt: false
    use_session: false
    stack:
      active: false # whether to stack consecutive timesteps
      size: 32 # number of consecutive timesteps to stack
      stride: 4 # stacking stride
  transformer:
    n_layers: 5 # number of transformer layers
    hidden_size: 512 # hidden space of the transformer
    use_scalenorm: false # use scalenorm instead of layernorm
    use_rope: false # use rotary positional encoding
    rope_theta: 10000.0 # rope angle of rotation
    n_heads: 8 # number of attention heads
    attention_bias: true # learn bias in the attention layers
    act: gelu # activation function in mlp layers
    inter_size: 1024 # intermediate dimension in the mlp layers
    mlp_bias: true # learn bias in the mlp layers
    dropout: 0.4 # dropout in transformer layers
    fixup_init: true # modify weight initialization
  factors:
    active: false # project from hidden_size to factors
    size: 8 # factors size
    act: relu # activation function after projecting to factors
    bias: true # use bias in projection to factors
    dropout: 0.0 # dropout in projection to factors
    fixup_init: false # modify weight initialization
    init_range: 0.1 # initialization range for factors projection
decoder:
  from_pt: null
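
The interaction of `ratio`, `zero_ratio`, `random_ratio`, `expand_prob`, and `max_timespan` in ``temporal`` mode is easy to misread. Below is a minimal sketch of that masking scheme, not the repository's implementation; the function name `temporal_mask` and its exact tensor layout are assumptions made for illustration:

```python
import torch

def temporal_mask(spikes: torch.Tensor,
                  ratio: float = 0.3,
                  zero_ratio: float = 1.0,
                  random_ratio: float = 1.0,
                  expand_prob: float = 0.0,
                  max_timespan: int = 1):
    """Corrupt a (batch, time, channels) spike tensor for masked prediction.

    Hypothetical sketch: returns the corrupted spikes and a boolean
    (batch, time) mask marking the timesteps the model must reconstruct.
    """
    B, T, N = spikes.shape
    # Choose `ratio` of the timesteps as prediction targets.
    mask = torch.rand(B, T) < ratio
    # Optionally expand each masked timestep into a span of up to
    # `max_timespan` steps (a no-op with the defaults above).
    if expand_prob > 0 and max_timespan > 1:
        for offset in range(1, max_timespan):
            expand = (torch.rand(B, T) < expand_prob) & mask
            mask = mask | expand.roll(offset, dims=1)
    corrupted = spikes.clone()
    # Of the masked timesteps, zero out `zero_ratio` of them...
    zero = mask & (torch.rand(B, T) < zero_ratio)
    corrupted[zero] = 0.0
    # ...and of the remaining (not zeroed) masked timesteps, replace
    # `random_ratio` with activity copied from random other timesteps.
    replace = mask & ~zero & (torch.rand(B, T) < random_ratio)
    random_idx = torch.randint(0, T, (B, T))
    shuffled = torch.gather(spikes, 1,
                            random_idx.unsqueeze(-1).expand(B, T, N))
    corrupted[replace] = shuffled[replace]
    return corrupted, mask
```

With the values in this config (`zero_ratio: 1.0`), every masked timestep is zeroed, so the random-replacement branch never fires.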
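
The `context` block bounds how far each timestep may attend forward and backward, with `-1` meaning unrestricted in that direction. A small sketch of how such a window could be turned into an attention mask (the helper `context_mask` is hypothetical, not from the repository):

```python
import torch

def context_mask(T: int, forward: int = -1, backward: int = -1) -> torch.Tensor:
    """Boolean (T, T) mask: entry (i, j) is True if timestep i may attend
    to timestep j; -1 disables the corresponding restriction."""
    idx = torch.arange(T)
    rel = idx[None, :] - idx[:, None]  # j - i: positive = future, negative = past
    allowed = torch.ones(T, T, dtype=torch.bool)
    if forward >= 0:
        allowed &= rel <= forward      # cap how far ahead each step can look
    if backward >= 0:
        allowed &= rel >= -backward    # cap how far back each step can look
    return allowed
```

With `forward: -1` and `backward: -1`, as here, the mask is all-True and attention is fully bidirectional; setting `forward: 0` would recover a causal model.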