---
# Training configuration for Mistral-7B-v0.1 (nanotron-style config).
# Sections left as `null` (checkpoints, data, logging, optimizer, profiler,
# tokens) are expected to be filled in or defaulted by the consuming framework.

checkpoints: null
data: null

# Run-level metadata: experiment identity, seeding, and resume state.
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: false
  project: mistralai
  run: Mistral-7B-v0.1
  seed: 42
  step: 0

logging: null

# Model definition: runtime settings plus the architecture hyperparameters.
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  # Weight initialization: normal distribution with the given std.
  init_method:
    std: 0.025
  make_vocab_size_divisible_by: 1
  # Architecture hyperparameters for Mistral-7B (GQA: 32 query heads,
  # 8 key/value heads; sliding-window attention of 4096 tokens).
  model_config:
    attn_pdrop: 0.0
    bos_token_id: 1
    eos_token_id: 2
    hidden_act: silu
    hidden_size: 4096
    initializer_range: 0.02
    intermediate_size: 14336
    is_mistral_config: true
    max_position_embeddings: 32768
    num_attention_heads: 32
    num_hidden_layers: 32
    num_key_value_heads: 8
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_theta: 10000.0
    sliding_window_size: 4096
    tie_word_embeddings: false
    use_cache: true
    vocab_size: 32000

optimizer: null

# 3D parallelism layout: 2-way data, 2-way pipeline (1F1B schedule),
# 2-way tensor parallelism with reduce-scatter + async linear comms.
parallelism:
  dp: 2
  pp: 2
  pp_engine: 1f1b
  recompute_granularity: SELECTIVE
  tp: 2
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER

profiler: null

# Tokenizer is pulled from the Hugging Face Hub by name.
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
  tokenizer_revision: null

tokens: null