{
  "model": {
    "d_model": 128,
    "n_layers": 4,
    "n_heads": 4,
    "d_ff": 512,
    "dropout": 0.1,
    "activation": "gelu",
    "max_seq_len": 64,
    "vocab_size": 2000,
    "pos_encoding_type": "rotary",
    "use_flash_attention": true,
    "norm_type": "rmsnorm",
    "norm_eps": 1e-06,
    "init_std": 0.02
  },
  "diffusion": {
    "n_timesteps": 200,
    "n_inference_steps": 20,
    "schedule_type": "cosine",
    "beta_start": 0.0001,
    "beta_end": 0.02,
    "prediction_type": "epsilon",
    "sampling_method": "ddim",
    "eta_ddim": 0.0,
    "clip_sample_max": 5.0,
    "clip_sample_min": -5.0,
    "loss_type": "mse",
    "loss_weighting": "min_snr",
    "p2_gamma": 1.0,
    "p2_k": 1.0
  },
  "graph_encoder": {
    "d_graph": 128,
    "n_graph_layers": 2,
    "n_graph_heads": 4,
    "max_evidence_nodes": 50,
    "max_compositions": 20,
    "max_anomalies": 10,
    "max_reasoning_steps": 15,
    "conditioning_method": "cross_attention",
    "embed_confidence": true,
    "embed_temporal": true
  },
  "tokenizer": {
    "bpe_vocab_size": 28000,
    "max_sentences": 32,
    "sentence_boundary_token": "<sent>",
    "pad_token": "<pad>",
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "mask_token": "<mask>",
    "noise_token": "<noise>",
    "evidence_token": "<evidence>",
    "anomaly_token": "<anomaly>",
    "confidence_token": "<confidence>",
    "reasoning_token": "<reasoning>",
    "composition_token": "<composition>",
    "temporal_token": "<temporal>",
    "min_frequency": 2,
    "dropout_rate": 0.0
  },
  "training": {
    "learning_rate": 0.0001,
    "weight_decay": 0.01,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "adam_eps": 1e-08,
    "lr_schedule": "cosine",
    "warmup_steps": 2000,
    "batch_size": 32,
    "gradient_accumulation_steps": 4,
    "max_steps": 500000,
    "max_epochs": 100,
    "dropout": 0.1,
    "grad_clip_norm": 1.0,
    "use_amp": true,
    "amp_dtype": "bf16",
    "save_every_steps": 5000,
    "eval_every_steps": 1000,
    "keep_last_n_checkpoints": 3,
    "use_ema": true,
    "ema_decay": 0.9999,
    "train_data_path": "",
    "val_data_path": "",
    "num_workers": 4,
    "log_every_steps": 100,
    "wandb_project": "aam-diffusion-llm",
    "wandb_run_name": ""
  },
  "inference": {
    "n_steps": 50,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 0.95,
    "repetition_penalty": 1.2,
    "max_output_sentences": 16,
    "language": "id"
  },
  "anchored_decoder": {
    "d_model": 128,
    "d_vocab": 2000,
    "n_refine_steps": 3,
    "d_refine": 64,
    "use_evoformer_feedback": true,
    "n_feedback_iterations": 2,
    "disambiguation_heads": 8
  },
  "flow_matching": {
    "d_model": 128,
    "d_vocab": 2000,
    "num_steps": 3
  },
  "evoformer": {
    "d_model": 128,
    "n_recycling_steps": 3,
    "dropout": 0.0,
    "use_layer_recycling": true,
    "use_token_recycling": true,
    "use_decoder_feedback": true,
    "use_prediction_recycling": true,
    "min_recycling_improvement": 0.0001
  },
  "dual_memory": {
    "d_model": 128,
    "working_memory_size": 512,
    "long_term_memory_dim": 64,
    "consolidation_method": "attention",
    "retrieval_method": "attention",
    "n_retrieval_heads": 4,
    "dropout": 0.0
  },
  "mcts": {
    "num_simulations": 4,
    "c_puct": 1.5,
    "temperature": 1.0,
    "max_depth": 10,
    "use_value_network": true,
    "max_children": 8
  },
  "thinking_toggle": {
    "d_model": 128,
    "threshold": 0.5
  },
  "matryoshka": {
    "d_model": 768,
    "d_ff": 3072,
    "granularity_factors": [
      0.25,
      0.5,
      0.75,
      1.0
    ],
    "matryoshka_loss_weight": 0.1,
    "use_adaptive": true
  },
  "use_anchored_decoder": true,
  "use_flow_matching": true,
  "use_evoformer": true,
  "use_dual_memory": true,
  "use_mcts": true,
  "use_thinking_toggle": true,
  "use_matryoshka": true,
  "use_swiglu_ffn": true,
  "model_name": "aam-diffusion-v2.0",
  "output_dir": "./output",
  "seed": 42,
  "aam_mind_source": "rsvs_graph",
  "aam_body_type": "specialized_diffusion"
}