{
  "ngpus": 8,
  "tokens": 50257,
  "training": {
    "batch_size": 512,
    "accum": 2,
    "n_iters": 1300001,
    "snapshot_freq": 50000,
    "log_freq": 50,
    "eval_freq": 100,
    "snapshot_freq_for_preemption": 10000,
    "weight": "standard",
    "snapshot_sampling": true,
    "ema": 0.9999
  },
  "data": {
    "train": "openwebtext",
    "valid": "wikitext103",
    "cache_dir": "data"
  },
  "graph": {
    "type": "absorb"
  },
  "noise": {
    "type": "loglinear",
    "sigma_min": 0.0001,
    "sigma_max": 20
  },
  "sampling": {
    "predictor": "euler",
    "steps": 128,
    "noise_removal": true
  },
  "eval": {
    "batch_size": 512,
    "perplexity": true,
    "perplexity_batch_size": 32
  },
  "optim": {
    "weight_decay": 0,
    "optimizer": "AdamW",
    "lr": 0.0003,
    "beta1": 0.9,
    "beta2": 0.999,
    "eps": 1e-08,
    "warmup": 2500,
    "grad_clip": 1.0
  },
  "model": {
    "name": "medium",
    "type": "ddit",
    "hidden_size": 1024,
    "cond_dim": 128,
    "length": 1024,
    "n_blocks": 24,
    "n_heads": 16,
    "scale_by_sigma": true,
    "dropout": 0.1
  },
  "work_dir": "absorb_medium"
}