# culturax-ar-spbpe32k-focus-embs-anneal-bf16-mixed-xassyy15

W&B run url: https://wandb.ai/konstantinjdobler/tv/runs/xassyy15

W&B run ID: xassyy15

## Metadata

```json
{
  "data_dir": "/raid/konstantin.dobler/culturax/ar/ar/tokenized/tokenizers_ar_sp-bpe-ar-32kauto",
  "model_path": "/raid/konstantin.dobler/checkpoints/culturax-ar-spbpe32k-smart-heuristics-attn-fix-infini-just-embs/ruxr78xn/step-0000100-ckpt",
  "from_scratch": false,
  "saved_checkpoint_path": null,
  "resume": false,
  "train_file": "train.txt",
  "val_file": "val.txt",
  "tokenizer_path": "/raid/konstantin.dobler/checkpoints/culturax-ar-spbpe32k-smart-heuristics-attn-fix-infini-just-embs/ruxr78xn/step-0000100-ckpt",
  "base_unit": "optimizer-steps",
  "training_goal": 7680,
  "eval_interval": 384,
  "eval_samples": 10000,
  "save_interval": 768,
  "log_interval": 1,
  "model_profiling_interval": 10,
  "warmup_period": 76,
  "lr_decay_period": 4608,
  "lr_final_annealing_period": 1075,
  "block_size": 4096,
  "decontaminated_packing": true,
  "max_lr": 3e-05,
  "batch_size": 256,
  "weight_decay": 0.05,
  "beta1": 0.9,
  "beta2": 0.95,
  "grad_clip": 1.0,
  "min_lr": 2e-06,
  "infinite_lr": 1.5e-05,
  "accelerator": "cuda",
  "num_devices": 4,
  "activation_checkpointing": false,
  "distributed_strategy": "auto",
  "use_fsdp": true,
  "fsdp_sharding_strategy": "SHARD_GRAD_OP",
  "fsdp_limit_all_gathers": false,
  "fsdp_cpu_offload": false,
  "fsdp_ram_friendly_checkpointing": false,
  "fsdp_backward_prefetch_post": false,
  "smart_cuda_alloc": false,
  "fast_model_loading": true,
  "micro_batch_size": 1,
  "eval_micro_batch_size": 10,
  "gradient_accumulation_steps": 64,
  "precision": "bf16-true",
  "use_anyprecision_adamw": false,
  "adamw_foreach": true,
  "compile": false,
  "use_additional_flash_attn_kernels": true,
  "workers": 8,
  "preprocessing_workers": 224,
  "run_name": "culturax-ar-spbpe32k-focus+justembs-attn-fix-infini",
  "seed": 42,
  "only_val": false,
  "val_before_training": true,
  "out_dir": "/raid/konstantin.dobler/checkpoints/culturax-ar-spbpe32k-focus+justembs-attn-fix-infini",
  "wandb_tags": [],
  "offline": false,
  "debug": false,
  "model_profiling": true,
  "force_deterministic": false,
  "fast_dev_run": false,
  "cross_tokenizer_val": false,
  "optimized_activation_checkpointing_policy": false,
  "train_embeddings": false,
  "train_only_embeddings": false,
  "focus_init": false,
  "refocus_init": false,
  "mean_init": false,
  "random_init": false,
  "zipf_init": false,
  "smart_heuristic_init": false,
  "wechsel_init": false,
  "deepfocus_init": false,
  "zett_init": false,
  "focus_fasttext_dim": 300,
  "focus_fasttext_epochs": 3,
  "focus_fasttext_min_count": 10,
  "focus_auxiliary_mode": "fasttext-tokenlevel",
  "focus_fasttext_model_path": null,
  "focus_exact_match_all": false,
  "focus_match_symbols": false,
  "focus_bilingual_dict": null,
  "focus_bilingual_dict_mode": "mean",
  "focus_fuzzy_match_all": false,
  "focus_random_init_source": "source",
  "lora_r": 8,
  "lora_alpha": 16,
  "lora_dropout": 0.05,
  "lora_query": false,
  "lora_key": false,
  "lora_value": false,
  "lora_projection": false,
  "lora_mlp": false,
  "lora_head": false,
  "perf_benchmark": false
}
```