base_model: conceptofmind/LLongMA-2-7b-16k
base_model_config: conceptofmind/LLongMA-2-7b-16k
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
tokenizer_use_fast: true
tokenizer_legacy: true
load_in_8bit: false
load_in_4bit: false
strict: false
push_dataset_to_hub:
hf_use_auth_token:
datasets:
  - path: openaccess-ai-collective/oo-gpt4-filtered
    type: alpaca_w_system.load_open_orca_chatml
    data_files:
      - 1M-GPT4-Augmented-filtered-gt10.parquet
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
adapter:
lora_model_dir:
sequence_len: 16384
max_packed_sequence_len:
sample_packing: true
sample_packing_eff_est: 0.99
sample_packing_seq_len_multiplier: 2
total_num_tokens: 372602546
lora_r:
lora_alpha:
lora_dropout:
lora_target_modules:
lora_target_linear:
lora_fan_in_fan_out:
wandb_project: open-long-orca-7b
wandb_watch:
wandb_run_id:
wandb_log_model:
output_dir: ./open-long-orca-7b
gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_torch
adam_beta2: 0.95
adam_eps: 0.00001
max_grad_norm: 1.0
torchdistx_path:
lr_scheduler: cosine
lr_quadratic_warmup: true
learning_rate: 0.000017
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: true
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
sdp_attention:
flash_optimum:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 32
eval_steps: 284
save_steps:
debug:
deepspeed:
weight_decay: 0.1
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
tokens:
  - "<|im_start|>"
  - "<|im_end|>"
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
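
# Usage: a minimal sketch, assuming this file is saved as open-long-orca-7b.yml
# (filename is an assumption) in an environment with axolotl and accelerate installed.
# Training can typically be launched with:
#
#   accelerate launch -m axolotl.cli.train open-long-orca-7b.yml
#
# Older axolotl releases used `accelerate launch scripts/finetune.py <config>.yml`
# instead of the axolotl.cli.train entry point.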