Upload HDLM model with complete HF integration
config.yaml ADDED  +88 -0

@@ -0,0 +1,88 @@
+ngpus: 4
+gradient_accumulation_steps: 8
+pretrain_autoregressive_path: /home/toolkit/research-diffcodegen/exp_local/openwebtext/mdlm-autoregressive/org-DiTAR-absorb-v2/checkpoints-meta/checkpoint.pth
+tokenizer:
+  tokens: 50257
+  model: gpt2
+training:
+  batch_size: 512
+  accum: ${gradient_accumulation_steps}
+  n_iters: 1000000
+  snapshot_freq: 100
+  log_freq: 10
+  eval_freq: 100
+  snapshot_freq_for_preemption: 3000
+  weight: standard
+  snapshot_sampling: true
+  ema: 0.9999
+  warmup_iter: -1
+data:
+  train: openwebtext-train
+  valid: wikitext103
+  cache_dir: /home/toolkit/research-diffcodegen/data
+  debug: false
+graph:
+  type: QGamma
+  gamma: 0.01
+  file: /home/toolkit/research-diffcodegen/data
+  report_all: false
+  expanded_sigma: true
+noise:
+  type: loglinear
+  sigma_min: 0.0001
+  sigma_max: 2.0
+  ar_diffusion: false
+  expanded_sigma: ${graph.expanded_sigma}
+sampling:
+  predictor: analytic
+  steps_per_level: 1
+  noise_removal: true
+  strategy: direct
+  strategy_param: 0.9
+annealing:
+  type: block
+  efficient: false
+  width: 1024
+  tau: 2048
+  eval_tau: 512
+  steps_per_level: ${sampling.steps_per_level}
+  sampling_method: SAR
+  diffusion_loss_weight: 1.0
+  ce_loss_weight: 4.0
+  sampling_eps: 0.0001
+  attention:
+    context_type: block_causal
+    block_type: full
+  match_inference: true
+eval:
+  batch_size: 32
+  perplexity: true
+  perplexity_batch_size: 16
+optim:
+  weight_decay: 0.0
+  optimizer: AdamW
+  lr: 0.0003
+  beta1: 0.9
+  beta2: 0.999
+  eps: 1.0e-08
+  warmup: 10000
+  grad_clip: 1.0
+  scheduler: lambda
+experiment:
+  name: QGamma0.01-v2
+  wandb_project: debug-QGamma
+model:
+  name: gamma_hdlm
+  type: ddit
+  hidden_size: 768
+  cond_dim: 128
+  length: 1024
+  n_blocks: 12
+  n_heads: 12
+  scale_by_sigma: false
+  dropout: 0.1
+  transformer_sigma_conditioning: true
+  hybrid_sigma_embedding: true
+  post_process_logits: true
+  use_timestep_embedding: true
+model_type: gamma_hybrid
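
Note that the ${...} values in this config (training.accum and noise.expanded_sigma) are interpolation references, not literal strings. A minimal sketch of reading the file, assuming OmegaConf-style resolution — the library choice is an assumption, since the commit does not state how the config is consumed:

# Assumption: the ${...} syntax is OmegaConf interpolation; OmegaConf itself
# is not named anywhere in this commit.
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")

# Interpolations resolve on access:
#   training.accum       -> ${gradient_accumulation_steps} -> 8
#   noise.expanded_sigma -> ${graph.expanded_sigma}        -> true
assert cfg.training.accum == cfg.gradient_accumulation_steps
assert cfg.noise.expanded_sigma == cfg.graph.expanded_sigma

# Materialize a fully resolved plain dict, e.g. for experiment logging.
resolved = OmegaConf.to_container(cfg, resolve=True)
print(resolved["experiment"]["wandb_project"])  # debug-QGamma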