wandb_version: 1

_wandb:
  desc: null
  value:
    python_version: 3.9.15
    cli_version: 0.15.3
    framework: huggingface
    huggingface_version: 4.30.0.dev0
    is_jupyter_run: false
    is_kaggle_kernel: false
    start_time: 1685551775.453024
    t:
      1:
      - 1
      - 5
      - 11
      - 49
      - 51
      - 53
      - 55
      - 71
      2:
      - 1
      - 5
      - 11
      - 49
      - 51
      - 53
      - 55
      - 71
      3:
      - 7
      - 23
      4: 3.9.15
      5: 0.15.3
      6: 4.30.0.dev0
      8:
      - 5
    m:
    - 1: train/global_step
      6:
      - 3
vocab_size:
  desc: null
  value: 32001
max_position_embeddings:
  desc: null
  value: 2048
hidden_size:
  desc: null
  value: 3200
intermediate_size:
  desc: null
  value: 8640
num_hidden_layers:
  desc: null
  value: 26
num_attention_heads:
  desc: null
  value: 32
hidden_act:
  desc: null
  value: silu
initializer_range:
  desc: null
  value: 0.02
rms_norm_eps:
  desc: null
  value: 1.0e-06
use_cache:
  desc: null
  value: false
return_dict:
  desc: null
  value: true
output_hidden_states:
  desc: null
  value: false
output_attentions:
  desc: null
  value: false
torchscript:
  desc: null
  value: false
torch_dtype:
  desc: null
  value: float16
use_bfloat16:
  desc: null
  value: false
tf_legacy_loss:
  desc: null
  value: false
pruned_heads:
  desc: null
  value: {}
tie_word_embeddings:
  desc: null
  value: false
is_encoder_decoder:
  desc: null
  value: false
is_decoder:
  desc: null
  value: false
cross_attention_hidden_size:
  desc: null
  value: null
add_cross_attention:
  desc: null
  value: false
tie_encoder_decoder:
  desc: null
  value: false
max_length:
  desc: null
  value: 20
min_length:
  desc: null
  value: 0
do_sample:
  desc: null
  value: false
early_stopping:
  desc: null
  value: false
num_beams:
  desc: null
  value: 1
num_beam_groups:
  desc: null
  value: 1
diversity_penalty:
  desc: null
  value: 0.0
temperature:
  desc: null
  value: 1.0
top_k:
  desc: null
  value: 50
top_p:
  desc: null
  value: 1.0
typical_p:
  desc: null
  value: 1.0
repetition_penalty:
  desc: null
  value: 1.0
length_penalty:
  desc: null
  value: 1.0
no_repeat_ngram_size:
  desc: null
  value: 0
encoder_no_repeat_ngram_size:
  desc: null
  value: 0
bad_words_ids:
  desc: null
  value: null
num_return_sequences:
  desc: null
  value: 1
chunk_size_feed_forward:
  desc: null
  value: 0
output_scores:
  desc: null
  value: false
return_dict_in_generate:
  desc: null
  value: false
forced_bos_token_id:
  desc: null
  value: null
forced_eos_token_id:
  desc: null
  value: null
remove_invalid_values:
  desc: null
  value: false
exponential_decay_length_penalty:
  desc: null
  value: null
suppress_tokens:
  desc: null
  value: null
begin_suppress_tokens:
  desc: null
  value: null
architectures:
  desc: null
  value:
  - LlamaForCausalLM
finetuning_task:
  desc: null
  value: null
id2label:
  desc: null
  value:
    '0': LABEL_0
    '1': LABEL_1
label2id:
  desc: null
  value:
    LABEL_0: 0
    LABEL_1: 1
tokenizer_class:
  desc: null
  value: null
prefix:
  desc: null
  value: null
bos_token_id:
  desc: null
  value: 1
pad_token_id:
  desc: null
  value: 0
eos_token_id:
  desc: null
  value: 2
sep_token_id:
  desc: null
  value: null
decoder_start_token_id:
  desc: null
  value: null
task_specific_params:
  desc: null
  value: null
problem_type:
  desc: null
  value: null
_name_or_path:
  desc: null
  value: openlm-research/open_llama_3b_600bt_preview
transformers_version:
  desc: null
  value: 4.30.0.dev0
model_type:
  desc: null
  value: llama
quantization_config:
  desc: null
  value:
    load_in_8bit: false
    load_in_4bit: true
    llm_int8_threshold: 6.0
    llm_int8_skip_modules: null
    llm_int8_enable_fp32_cpu_offload: false
    llm_int8_has_fp16_weight: false
    bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant: true
    bnb_4bit_compute_dtype: bfloat16
output_dir:
  desc: null
  value: ./dolly-lora-3b
overwrite_output_dir:
  desc: null
  value: false
do_train:
  desc: null
  value: false
do_eval:
  desc: null
  value: true
do_predict:
  desc: null
  value: false
evaluation_strategy:
  desc: null
  value: steps
prediction_loss_only:
  desc: null
  value: false
per_device_train_batch_size:
  desc: null
  value: 4
per_device_eval_batch_size:
  desc: null
  value: 8
per_gpu_train_batch_size:
  desc: null
  value: None
per_gpu_eval_batch_size:
  desc: null
  value: None
gradient_accumulation_steps:
  desc: null
  value: 32
eval_accumulation_steps:
  desc: null
  value: None
eval_delay:
  desc: null
  value: 0
learning_rate:
  desc: null
  value: 0.0003
weight_decay:
  desc: null
  value: 0.0
adam_beta1:
  desc: null
  value: 0.9
adam_beta2:
  desc: null
  value: 0.999
adam_epsilon:
  desc: null
  value: 1.0e-08
max_grad_norm:
  desc: null
  value: 1.0
num_train_epochs:
  desc: null
  value: 3
max_steps:
  desc: null
  value: -1
lr_scheduler_type:
  desc: null
  value: linear
warmup_ratio:
  desc: null
  value: 0.0
warmup_steps:
  desc: null
  value: 100
log_level:
  desc: null
  value: passive
log_level_replica:
  desc: null
  value: warning
log_on_each_node:
  desc: null
  value: true
logging_dir:
  desc: null
  value: ./dolly-lora-3b/runs/May31_16-49-31_w-khoic-qlora-098760734b88449facec48d4db511ae3-54fc475665-pz5rx
logging_strategy:
  desc: null
  value: steps
logging_first_step:
  desc: null
  value: false
logging_steps:
  desc: null
  value: 10
logging_nan_inf_filter:
  desc: null
  value: true
save_strategy:
  desc: null
  value: steps
save_steps:
  desc: null
  value: 100
save_total_limit:
  desc: null
  value: 3
save_safetensors:
  desc: null
  value: false
save_on_each_node:
  desc: null
  value: false
no_cuda:
  desc: null
  value: false
use_mps_device:
  desc: null
  value: false
seed:
  desc: null
  value: 42
data_seed:
  desc: null
  value: None
jit_mode_eval:
  desc: null
  value: false
use_ipex:
  desc: null
  value: false
bf16:
  desc: null
  value: false
fp16:
  desc: null
  value: false
fp16_opt_level:
  desc: null
  value: O1
half_precision_backend:
  desc: null
  value: auto
bf16_full_eval:
  desc: null
  value: false
fp16_full_eval:
  desc: null
  value: false
tf32:
  desc: null
  value: None
local_rank:
  desc: null
  value: 0
ddp_backend:
  desc: null
  value: None
tpu_num_cores:
  desc: null
  value: None
tpu_metrics_debug:
  desc: null
  value: false
debug:
  desc: null
  value: '[]'
dataloader_drop_last:
  desc: null
  value: false
eval_steps:
  desc: null
  value: 100
dataloader_num_workers:
  desc: null
  value: 0
past_index:
  desc: null
  value: -1
run_name:
  desc: null
  value: ./dolly-lora-3b
disable_tqdm:
  desc: null
  value: false
remove_unused_columns:
  desc: null
  value: true
label_names:
  desc: null
  value: None
load_best_model_at_end:
  desc: null
  value: false
metric_for_best_model:
  desc: null
  value: None
greater_is_better:
  desc: null
  value: None
ignore_data_skip:
  desc: null
  value: false
sharded_ddp:
  desc: null
  value: '[]'
fsdp:
  desc: null
  value: '[]'
fsdp_min_num_params:
  desc: null
  value: 0
fsdp_config:
  desc: null
  value: '{''fsdp_min_num_params'': 0, ''xla'': False, ''xla_fsdp_grad_ckpt'': False}'
fsdp_transformer_layer_cls_to_wrap:
  desc: null
  value: None
deepspeed:
  desc: null
  value: None
label_smoothing_factor:
  desc: null
  value: 0.0
optim:
  desc: null
  value: paged_adamw_8bit
optim_args:
  desc: null
  value: None
adafactor:
  desc: null
  value: false
group_by_length:
  desc: null
  value: true
length_column_name:
  desc: null
  value: length
report_to:
  desc: null
  value: '[''tensorboard'', ''wandb'']'
ddp_find_unused_parameters:
  desc: null
  value: None
ddp_bucket_cap_mb:
  desc: null
  value: None
dataloader_pin_memory:
  desc: null
  value: true
skip_memory_metrics:
  desc: null
  value: true
use_legacy_prediction_loop:
  desc: null
  value: false
push_to_hub:
  desc: null
  value: false
resume_from_checkpoint:
  desc: null
  value: None
hub_model_id:
  desc: null
  value: None
hub_strategy:
  desc: null
  value: every_save
hub_token:
  desc: null
  value:
hub_private_repo:
  desc: null
  value: false
gradient_checkpointing:
  desc: null
  value: false
include_inputs_for_metrics:
  desc: null
  value: false
fp16_backend:
  desc: null
  value: auto
push_to_hub_model_id:
  desc: null
  value: None
push_to_hub_organization:
  desc: null
  value: None
push_to_hub_token:
  desc: null
  value:
mp_parameters:
  desc: null
  value: ''
auto_find_batch_size:
  desc: null
  value: false
full_determinism:
  desc: null
  value: false
torchdynamo:
  desc: null
  value: None
ray_scope:
  desc: null
  value: last
ddp_timeout:
  desc: null
  value: 1800
torch_compile:
  desc: null
  value: false
torch_compile_backend:
  desc: null
  value: None
torch_compile_mode:
  desc: null
  value: None
xpu_backend:
  desc: null
  value: None
train_batch_size:
  desc: null
  value: 4
eval_batch_size:
  desc: null
  value: 8
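# Editor's sketch (kept as comments so the file stays valid YAML; not part of
# the logged run): a minimal, assumed example of how the quantization_config
# and the key training arguments recorded above would typically be built with
# transformers 4.30 and bitsandbytes. Model and dataset wiring is omitted;
# only values present in this file are used, and variable names are illustrative.
#
#   import torch
#   from transformers import BitsAndBytesConfig, TrainingArguments
#
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,                      # matches load_in_4bit above
#       bnb_4bit_quant_type="nf4",
#       bnb_4bit_use_double_quant=True,
#       bnb_4bit_compute_dtype=torch.bfloat16,
#   )
#
#   training_args = TrainingArguments(
#       output_dir="./dolly-lora-3b",
#       per_device_train_batch_size=4,
#       per_device_eval_batch_size=8,
#       gradient_accumulation_steps=32,
#       learning_rate=3e-4,
#       num_train_epochs=3,
#       warmup_steps=100,
#       evaluation_strategy="steps",
#       eval_steps=100,
#       save_strategy="steps",
#       save_steps=100,
#       save_total_limit=3,
#       logging_steps=10,
#       optim="paged_adamw_8bit",
#       group_by_length=True,
#       report_to=["tensorboard", "wandb"],
#   )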