wandb_version: 1 _wandb: desc: null value: python_version: 3.10.13 cli_version: 0.16.6 framework: huggingface huggingface_version: 4.39.3 is_jupyter_run: true is_kaggle_kernel: true start_time: 1714484610.0 t: 1: - 1 - 2 - 3 - 5 - 11 - 12 - 49 - 51 - 53 - 55 - 71 - 98 - 105 2: - 1 - 2 - 3 - 5 - 11 - 12 - 49 - 51 - 53 - 55 - 71 - 98 - 105 3: - 7 - 23 - 62 4: 3.10.13 5: 0.16.6 6: 4.39.3 8: - 1 - 2 - 5 9: 1: transformers_trainer 13: linux-x86_64 m: - 1: train/global_step 6: - 3 - 1: train/loss 5: 1 6: - 1 - 1: train/grad_norm 5: 1 6: - 1 - 1: train/learning_rate 5: 1 6: - 1 - 1: train/epoch 5: 1 6: - 1 - 1: eval/loss 5: 1 6: - 1 - 1: eval/runtime 5: 1 6: - 1 - 1: eval/samples_per_second 5: 1 6: - 1 - 1: eval/steps_per_second 5: 1 6: - 1 vocab_size: desc: null value: 32064 hidden_size: desc: null value: 3072 intermediate_size: desc: null value: 8192 num_hidden_layers: desc: null value: 32 num_attention_heads: desc: null value: 32 num_key_value_heads: desc: null value: 32 resid_pdrop: desc: null value: 0.0 embd_pdrop: desc: null value: 0.0 attention_dropout: desc: null value: 0.0 hidden_act: desc: null value: silu max_position_embeddings: desc: null value: 131072 original_max_position_embeddings: desc: null value: 4096 initializer_range: desc: null value: 0.02 rms_norm_eps: desc: null value: 1.0e-05 use_cache: desc: null value: false rope_theta: desc: null value: 10000.0 rope_scaling: desc: null value: long_factor: - 1.0299999713897705 - 1.0499999523162842 - 1.0499999523162842 - 1.0799999237060547 - 1.2299998998641968 - 1.2299998998641968 - 1.2999999523162842 - 1.4499999284744263 - 1.5999999046325684 - 1.6499998569488525 - 1.8999998569488525 - 2.859999895095825 - 3.68999981880188 - 5.419999599456787 - 5.489999771118164 - 5.489999771118164 - 9.09000015258789 - 11.579999923706055 - 15.65999984741211 - 15.769999504089355 - 15.789999961853027 - 18.360000610351562 - 21.989999771118164 - 23.079999923706055 - 30.009998321533203 - 32.35000228881836 - 32.590003967285156 - 35.56000518798828 - 39.95000457763672 - 53.840003967285156 - 56.20000457763672 - 57.95000457763672 - 59.29000473022461 - 59.77000427246094 - 59.920005798339844 - 61.190006256103516 - 61.96000671386719 - 62.50000762939453 - 63.3700065612793 - 63.48000717163086 - 63.48000717163086 - 63.66000747680664 - 63.850006103515625 - 64.08000946044922 - 64.760009765625 - 64.80001068115234 - 64.81001281738281 - 64.81001281738281 short_factor: - 1.05 - 1.05 - 1.05 - 1.1 - 1.1 - 1.1500000000000001 - 1.2000000000000002 - 1.2500000000000002 - 1.3000000000000003 - 1.3500000000000003 - 1.5000000000000004 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.000000000000001 - 2.0500000000000007 - 2.0500000000000007 - 2.0500000000000007 - 2.1000000000000005 - 2.1000000000000005 - 2.1000000000000005 - 2.1500000000000004 - 2.1500000000000004 - 2.3499999999999996 - 2.549999999999999 - 2.5999999999999988 - 2.5999999999999988 - 2.7499999999999982 - 2.849999999999998 - 2.849999999999998 - 2.9499999999999975 type: su sliding_window: desc: null value: 262144 return_dict: desc: null value: true output_hidden_states: desc: null value: false output_attentions: desc: null value: false torchscript: desc: null value: false torch_dtype: desc: null value: bfloat16 use_bfloat16: desc: null value: false tf_legacy_loss: desc: null value: false pruned_heads: desc: null value: {} tie_word_embeddings: desc: null value: false chunk_size_feed_forward: desc: null value: 0 is_encoder_decoder: desc: null value: false is_decoder: desc: null value: false cross_attention_hidden_size: desc: null value: null add_cross_attention: desc: null value: false tie_encoder_decoder: desc: null value: false max_length: desc: null value: 20 min_length: desc: null value: 0 do_sample: desc: null value: false early_stopping: desc: null value: false num_beams: desc: null value: 1 num_beam_groups: desc: null value: 1 diversity_penalty: desc: null value: 0.0 temperature: desc: null value: 1.0 top_k: desc: null value: 50 top_p: desc: null value: 1.0 typical_p: desc: null value: 1.0 repetition_penalty: desc: null value: 1.0 length_penalty: desc: null value: 1.0 no_repeat_ngram_size: desc: null value: 0 encoder_no_repeat_ngram_size: desc: null value: 0 bad_words_ids: desc: null value: null num_return_sequences: desc: null value: 1 output_scores: desc: null value: false return_dict_in_generate: desc: null value: false forced_bos_token_id: desc: null value: null forced_eos_token_id: desc: null value: null remove_invalid_values: desc: null value: false exponential_decay_length_penalty: desc: null value: null suppress_tokens: desc: null value: null begin_suppress_tokens: desc: null value: null architectures: desc: null value: - Phi3ForCausalLM finetuning_task: desc: null value: null id2label: desc: null value: '0': LABEL_0 '1': LABEL_1 label2id: desc: null value: LABEL_0: 0 LABEL_1: 1 tokenizer_class: desc: null value: null prefix: desc: null value: null bos_token_id: desc: null value: 1 pad_token_id: desc: null value: 32000 eos_token_id: desc: null value: 32000 sep_token_id: desc: null value: null decoder_start_token_id: desc: null value: null task_specific_params: desc: null value: null problem_type: desc: null value: null _name_or_path: desc: null value: microsoft/Phi-3-mini-128k-instruct transformers_version: desc: null value: 4.39.3 auto_map: desc: null value: AutoConfig: microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config AutoModelForCausalLM: microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM model_type: desc: null value: phi3 output_dir: desc: null value: /kaggle/working/ overwrite_output_dir: desc: null value: false do_train: desc: null value: false do_eval: desc: null value: true do_predict: desc: null value: false evaluation_strategy: desc: null value: epoch prediction_loss_only: desc: null value: false per_device_train_batch_size: desc: null value: 6 per_device_eval_batch_size: desc: null value: 6 per_gpu_train_batch_size: desc: null value: null per_gpu_eval_batch_size: desc: null value: null gradient_accumulation_steps: desc: null value: 4 eval_accumulation_steps: desc: null value: null eval_delay: desc: null value: 0 learning_rate: desc: null value: 0.0002 weight_decay: desc: null value: 0.01 adam_beta1: desc: null value: 0.9 adam_beta2: desc: null value: 0.999 adam_epsilon: desc: null value: 1.0e-08 max_grad_norm: desc: null value: 1.0 num_train_epochs: desc: null value: 30 max_steps: desc: null value: -1 lr_scheduler_type: desc: null value: linear lr_scheduler_kwargs: desc: null value: {} warmup_ratio: desc: null value: 0.0 warmup_steps: desc: null value: 2 log_level: desc: null value: passive log_level_replica: desc: null value: warning log_on_each_node: desc: null value: true logging_dir: desc: null value: /kaggle/working/runs/Apr30_13-43-08_fde755c1ca53 logging_strategy: desc: null value: epoch logging_first_step: desc: null value: false logging_steps: desc: null value: 500 logging_nan_inf_filter: desc: null value: true save_strategy: desc: null value: epoch save_steps: desc: null value: 500 save_total_limit: desc: null value: null save_safetensors: desc: null value: true save_on_each_node: desc: null value: false save_only_model: desc: null value: false no_cuda: desc: null value: false use_cpu: desc: null value: false use_mps_device: desc: null value: false seed: desc: null value: 42 data_seed: desc: null value: null jit_mode_eval: desc: null value: false use_ipex: desc: null value: false bf16: desc: null value: false fp16: desc: null value: true fp16_opt_level: desc: null value: O1 half_precision_backend: desc: null value: auto bf16_full_eval: desc: null value: false fp16_full_eval: desc: null value: false tf32: desc: null value: null local_rank: desc: null value: 0 ddp_backend: desc: null value: null tpu_num_cores: desc: null value: null tpu_metrics_debug: desc: null value: false debug: desc: null value: [] dataloader_drop_last: desc: null value: false eval_steps: desc: null value: null dataloader_num_workers: desc: null value: 0 dataloader_prefetch_factor: desc: null value: null past_index: desc: null value: -1 run_name: desc: null value: /kaggle/working/ disable_tqdm: desc: null value: false remove_unused_columns: desc: null value: true label_names: desc: null value: null load_best_model_at_end: desc: null value: true metric_for_best_model: desc: null value: loss greater_is_better: desc: null value: false ignore_data_skip: desc: null value: false fsdp: desc: null value: [] fsdp_min_num_params: desc: null value: 0 fsdp_config: desc: null value: min_num_params: 0 xla: false xla_fsdp_v2: false xla_fsdp_grad_ckpt: false fsdp_transformer_layer_cls_to_wrap: desc: null value: null accelerator_config: desc: null value: split_batches: false dispatch_batches: null even_batches: true use_seedable_sampler: true deepspeed: desc: null value: null label_smoothing_factor: desc: null value: 0.0 optim: desc: null value: paged_adamw_8bit optim_args: desc: null value: null adafactor: desc: null value: false group_by_length: desc: null value: false length_column_name: desc: null value: length report_to: desc: null value: - tensorboard - wandb ddp_find_unused_parameters: desc: null value: null ddp_bucket_cap_mb: desc: null value: null ddp_broadcast_buffers: desc: null value: null dataloader_pin_memory: desc: null value: true dataloader_persistent_workers: desc: null value: false skip_memory_metrics: desc: null value: true use_legacy_prediction_loop: desc: null value: false push_to_hub: desc: null value: false resume_from_checkpoint: desc: null value: null hub_model_id: desc: null value: null hub_strategy: desc: null value: every_save hub_token: desc: null value: hub_private_repo: desc: null value: false hub_always_push: desc: null value: false gradient_checkpointing: desc: null value: false gradient_checkpointing_kwargs: desc: null value: null include_inputs_for_metrics: desc: null value: false fp16_backend: desc: null value: auto push_to_hub_model_id: desc: null value: null push_to_hub_organization: desc: null value: null push_to_hub_token: desc: null value: mp_parameters: desc: null value: '' auto_find_batch_size: desc: null value: false full_determinism: desc: null value: false torchdynamo: desc: null value: null ray_scope: desc: null value: last ddp_timeout: desc: null value: 1800 torch_compile: desc: null value: false torch_compile_backend: desc: null value: null torch_compile_mode: desc: null value: null dispatch_batches: desc: null value: null split_batches: desc: null value: null include_tokens_per_second: desc: null value: false include_num_input_tokens_seen: desc: null value: false neftune_noise_alpha: desc: null value: null optim_target_modules: desc: null value: null