sanchit-gandhi HF staff committed on
Commit
d2861b2
1 Parent(s): 0ae2f86

Training in progress, epoch 1

Browse files
config_full.yaml CHANGED
@@ -22,25 +22,24 @@ gradient_checkpointing: true
22
  gradient_checkpointing_kwargs:
23
    use_reentrant: False
24
  hub_strategy: every_save
25
- learning_rate: 2.0e-05
26
  log_level: info
27
- logging_steps: 5
28
  logging_strategy: steps
29
  lr_scheduler_type: cosine
30
  max_seq_length: 2048
31
  max_steps: -1
32
- num_train_epochs: 1
33
  output_dir: ./
34
  overwrite_output_dir: true
35
- per_device_eval_batch_size: 8
36
- per_device_train_batch_size: 16
37
  push_to_hub: true
38
  remove_unused_columns: true
39
  report_to:
40
  - tensorboard
41
  - wandb
42
- save_strategy: "steps"
43
- save_steps: 100
44
  save_total_limit: 1
45
  seed: 42
46
- warmup_ratio: 0.1
 
22
  gradient_checkpointing_kwargs:
23
    use_reentrant: False
24
  hub_strategy: every_save
25
+ learning_rate: 3.0e-04
26
  log_level: info
27
+ logging_steps: 10
28
  logging_strategy: steps
29
  lr_scheduler_type: cosine
30
  max_seq_length: 2048
31
  max_steps: -1
32
+ num_train_epochs: 5
33
  output_dir: ./
34
  overwrite_output_dir: true
35
+ per_device_eval_batch_size: 32
36
+ per_device_train_batch_size: 64
37
  push_to_hub: true
38
  remove_unused_columns: true
39
  report_to:
40
  - tensorboard
41
  - wandb
42
+ save_strategy: "epoch"
 
43
  save_total_limit: 1
44
  seed: 42
45
+ warmup_ratio: 0.1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b92d043a09b94d7a22c478b47119a6bad9b5e914d6e9d6e6a566e2c9cc0a2be
3
  size 3141646744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b36e69ffbe5ae42f8478ebfbc7c07012a026dc6fbd9407f024781cecd7b9d168
3
  size 3141646744
runs/Feb01_17-38-02_ip-26-0-165-24/events.out.tfevents.1706809106.ip-26-0-165-24.237059.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:150e10d36280c0b789cf11077e52e9dfc0006692f1a394589a62f320ce274b43
3
- size 32714
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9a0969790c64d9b70fd83ebdd3bc06a1dee3d99e192778e00b366d5ec468d17
3
+ size 36796
runs/Feb01_17-58-13_ip-26-0-165-24/events.out.tfevents.1706810328.ip-26-0-165-24.239318.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6335f8b18c2d252b1614b2ce15bdaae4fa5dfa041da34f64c37eeab8379c14b0
3
+ size 9006
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85527db3b58440cee04521e4eb775e7259e847cd1a669eaf7502bcc6b6feb0ca
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3862821a4246455e69d6e9d62177958c9fe5579bd61be2e28572b81437bb7922
3
  size 5816
wandb/debug-cli.sanchit.log ADDED
File without changes
wandb/debug-internal.log CHANGED
The diff for this file is too large to render. See raw diff
 
wandb/debug.log CHANGED
@@ -1,28 +1,28 @@
1
- 2024-02-01 17:38:28,434 INFO MainThread:237059 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
2
- 2024-02-01 17:38:28,434 INFO MainThread:237059 [wandb_setup.py:_flush():76] Configure stats pid to 237059
3
- 2024-02-01 17:38:28,434 INFO MainThread:237059 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
4
- 2024-02-01 17:38:28,434 INFO MainThread:237059 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/settings
5
- 2024-02-01 17:38:28,434 INFO MainThread:237059 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
- 2024-02-01 17:38:28,434 INFO MainThread:237059 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
- 2024-02-01 17:38:28,434 INFO MainThread:237059 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft/run_sft.py'}
8
- 2024-02-01 17:38:28,435 INFO MainThread:237059 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/logs/debug.log
9
- 2024-02-01 17:38:28,435 INFO MainThread:237059 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/logs/debug-internal.log
10
- 2024-02-01 17:38:28,435 INFO MainThread:237059 [wandb_init.py:init():564] calling init triggers
11
- 2024-02-01 17:38:28,435 INFO MainThread:237059 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
12
  config: {}
13
- 2024-02-01 17:38:28,435 INFO MainThread:237059 [wandb_init.py:init():614] starting backend
14
- 2024-02-01 17:38:28,435 INFO MainThread:237059 [wandb_init.py:init():618] setting up manager
15
- 2024-02-01 17:38:28,441 INFO MainThread:237059 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
- 2024-02-01 17:38:28,451 INFO MainThread:237059 [wandb_init.py:init():624] backend started and connected
17
- 2024-02-01 17:38:28,453 INFO MainThread:237059 [wandb_init.py:init():716] updated telemetry
18
- 2024-02-01 17:38:28,475 INFO MainThread:237059 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
19
- 2024-02-01 17:38:28,720 INFO MainThread:237059 [wandb_run.py:_on_init():2254] communicating current version
20
- 2024-02-01 17:38:28,767 INFO MainThread:237059 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.2 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
 
22
- 2024-02-01 17:38:28,767 INFO MainThread:237059 [wandb_init.py:init():800] starting run threads in backend
23
- 2024-02-01 17:38:34,465 INFO MainThread:237059 [wandb_run.py:_console_start():2233] atexit reg
24
- 2024-02-01 17:38:34,465 INFO MainThread:237059 [wandb_run.py:_redirect():2088] redirect: wrap_raw
25
- 2024-02-01 17:38:34,466 INFO MainThread:237059 [wandb_run.py:_redirect():2153] Wrapping output streams.
26
- 2024-02-01 17:38:34,466 INFO MainThread:237059 [wandb_run.py:_redirect():2178] Redirects installed.
27
- 2024-02-01 17:38:34,467 INFO MainThread:237059 [wandb_init.py:init():841] run started, returning control to user process
28
- 2024-02-01 17:38:34,468 INFO MainThread:237059 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.36.2', 'model_type': 'mistral', 'output_dir': './', 'overwrite_output_dir': 
True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Feb01_17-38-02_ip-26-0-165-24', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': 
False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': False, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'max_seq_length': 2048}
 
1
+ 2024-02-01 17:58:50,075 INFO MainThread:239318 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
2
+ 2024-02-01 17:58:50,075 INFO MainThread:239318 [wandb_setup.py:_flush():76] Configure stats pid to 239318
3
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
4
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/settings
5
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft/run_sft.py'}
8
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_175850-i93q0p12/logs/debug.log
9
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_175850-i93q0p12/logs/debug-internal.log
10
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:init():564] calling init triggers
11
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
12
  config: {}
13
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:init():614] starting backend
14
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:init():618] setting up manager
15
+ 2024-02-01 17:58:50,079 INFO MainThread:239318 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-02-01 17:58:50,083 INFO MainThread:239318 [wandb_init.py:init():624] backend started and connected
17
+ 2024-02-01 17:58:50,087 INFO MainThread:239318 [wandb_init.py:init():716] updated telemetry
18
+ 2024-02-01 17:58:50,107 INFO MainThread:239318 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
19
+ 2024-02-01 17:58:50,409 INFO MainThread:239318 [wandb_run.py:_on_init():2254] communicating current version
20
+ 2024-02-01 17:58:50,456 INFO MainThread:239318 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.2 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
 
22
+ 2024-02-01 17:58:50,456 INFO MainThread:239318 [wandb_init.py:init():800] starting run threads in backend
23
+ 2024-02-01 17:58:55,347 INFO MainThread:239318 [wandb_run.py:_console_start():2233] atexit reg
24
+ 2024-02-01 17:58:55,347 INFO MainThread:239318 [wandb_run.py:_redirect():2088] redirect: wrap_raw
25
+ 2024-02-01 17:58:55,348 INFO MainThread:239318 [wandb_run.py:_redirect():2153] Wrapping output streams.
26
+ 2024-02-01 17:58:55,348 INFO MainThread:239318 [wandb_run.py:_redirect():2178] Redirects installed.
27
+ 2024-02-01 17:58:55,348 INFO MainThread:239318 [wandb_init.py:init():841] run started, returning control to user process
28
+ 2024-02-01 17:58:55,349 INFO MainThread:239318 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.36.2', 'model_type': 'mistral', 'output_dir': './', 'overwrite_output_dir': 
True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 64, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Feb01_17-58-13_ip-26-0-165-24', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 10, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': 
False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': False, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'max_seq_length': 2048}
wandb/run-20240201_173828-py26nu6m/files/output.log CHANGED
@@ -800,3 +800,66 @@
800
 
801
 
802
  88%|██████████████████████████████████████████████████████████████████████▋ | 963/1090 [17:05<01:57, 1.08it/s]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
800
 
801
 
802
  88%|██████████████████████████████████████████████████████████████████████▋ | 963/1090 [17:05<01:57, 1.08it/s]
803
+
804
+
805
+
806
+ 89%|███████████████████████████████████████████████████████████████████████▏ | 970/1090 [17:11<01:51, 1.08it/s]
807
+
808
+
809
+ 89%|███████████████████████████████████████████████████████████████████████▍ | 974/1090 [17:15<01:47, 1.08it/s]
810
+
811
+
812
+ 90%|███████████████████████████████████████████████████████████████████████▊ | 979/1090 [17:20<01:42, 1.08it/s]
813
+
814
+
815
+
816
+ 90%|████████████████████████████████████████████████████████████████████████▎ | 985/1090 [17:25<01:37, 1.08it/s]
817
+
818
+
819
+ 91%|████████████████████████████████████████████████████████████████████████▌ | 989/1090 [17:29<01:33, 1.08it/s]
820
+
821
+
822
+ 91%|████████████████████████████████████████████████████████████████████████▉ | 994/1090 [17:33<01:28, 1.08it/s]
823
+
824
+
825
+
826
+ 92%|████████████████████████████████████████████████████████████████████████▍ | 1000/1090 [17:39<01:23, 1.08it/s]
827
+ 92%|████████████████████████████████████████████████████████████████████████▍ | 1000/1090 [17:39<01:23, 1.08it/s][INFO|trainer.py:2889] 2024-02-01 17:56:15,101 >> Saving model checkpoint to ./tmp-checkpoint-1000
828
+ [INFO|configuration_utils.py:483] 2024-02-01 17:56:15,105 >> Configuration saved in ./tmp-checkpoint-1000/config.json
829
+ [INFO|configuration_utils.py:594] 2024-02-01 17:56:15,107 >> Configuration saved in ./tmp-checkpoint-1000/generation_config.json
830
+ [INFO|modeling_utils.py:2382] 2024-02-01 17:56:18,392 >> Model weights saved in ./tmp-checkpoint-1000/pytorch_model.bin
831
+ [INFO|tokenization_utils_base.py:2432] 2024-02-01 17:56:18,395 >> tokenizer config file saved in ./tmp-checkpoint-1000/tokenizer_config.json
832
+ [INFO|tokenization_utils_base.py:2441] 2024-02-01 17:56:18,397 >> Special tokens file saved in ./tmp-checkpoint-1000/special_tokens_map.json
833
+ /fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
834
+ warnings.warn(
835
+ [2024-02-01 17:56:18,423] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is about to be saved!
836
+ [2024-02-01 17:56:18,427] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt
837
+ [2024-02-01 17:56:18,427] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt...
838
+ [2024-02-01 17:56:18,432] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt.
839
+ [2024-02-01 17:56:18,439] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
840
+ [2024-02-01 17:56:22,348] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
841
+ [2024-02-01 17:56:22,353] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
842
+ [2024-02-01 17:56:22,553] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1000 is ready now!
843
+ [INFO|tokenization_utils_base.py:2432] 2024-02-01 17:56:25,394 >> tokenizer config file saved in ./tokenizer_config.json
844
+ [INFO|tokenization_utils_base.py:2441] 2024-02-01 17:56:25,396 >> Special tokens file saved in ./special_tokens_map.json
845
+ [INFO|trainer.py:2979] 2024-02-01 17:56:25,422 >> Deleting older checkpoint [checkpoint-900] due to args.save_total_limit
846
+
847
+ 92%|████████████████████████████████████████████████████████████████████████▋ | 1003/1090 [17:57<04:31, 3.12s/it]
848
+
849
+
850
+
851
+ 93%|█████████████████████████████████████████████████████████████████████████▏ | 1010/1090 [18:03<01:28, 1.10s/it]
852
+
853
+
854
+ 93%|█████████████████████████████████████████████████████████████████████████▍ | 1014/1090 [18:07<01:13, 1.03it/s]
855
+
856
+
857
+ 93%|█████████████████████████████████████████████████████████████████████████▊ | 1019/1090 [18:12<01:06, 1.07it/s]
858
+
859
+
860
+
861
+ 94%|██████████████████████████████████████████████████████████████████████████▎ | 1025/1090 [18:17<01:00, 1.08it/s]
862
+
863
+
864
+ 94%|██████████████████████████████████████████████████████████████████████████▌ | 1029/1090 [18:21<00:56, 1.08it/s]
865
+
wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json CHANGED
@@ -1 +1 @@
1
- {"train/loss": 2.475, "train/learning_rate": 7.905777244947954e-07, "train/epoch": 0.89, "train/global_step": 965, "_timestamp": 1706810141.6438708, "_runtime": 1033.1919968128204, "_step": 193}
 
1
+ {"train/loss": 2.4478, "train/learning_rate": 1.840338184455881e-07, "train/epoch": 0.94, "train/global_step": 1030, "_timestamp": 1706810216.6790407, "_runtime": 1108.2271666526794, "_step": 206}
wandb/run-20240201_173828-py26nu6m/logs/debug-internal.log CHANGED
@@ -1933,3 +1933,144 @@
1933
  2024-02-01 17:55:41,645 DEBUG SenderThread:237521 [sender.py:send():382] send: history
1934
  2024-02-01 17:55:41,645 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
1935
  2024-02-01 17:55:41,647 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1933
  2024-02-01 17:55:41,645 DEBUG SenderThread:237521 [sender.py:send():382] send: history
1934
  2024-02-01 17:55:41,645 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
1935
  2024-02-01 17:55:41,647 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
1936
+ 2024-02-01 17:55:42,330 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
1937
+ 2024-02-01 17:55:42,330 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1938
+ 2024-02-01 17:55:43,503 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
1939
+ 2024-02-01 17:55:45,335 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1940
+ 2024-02-01 17:55:46,281 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
1941
+ 2024-02-01 17:55:46,282 DEBUG SenderThread:237521 [sender.py:send():382] send: history
1942
+ 2024-02-01 17:55:46,282 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
1943
+ 2024-02-01 17:55:46,284 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
1944
+ 2024-02-01 17:55:46,337 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
1945
+ 2024-02-01 17:55:47,338 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1946
+ 2024-02-01 17:55:48,339 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1947
+ 2024-02-01 17:55:49,056 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
1948
+ 2024-02-01 17:55:50,906 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
1949
+ 2024-02-01 17:55:50,907 DEBUG SenderThread:237521 [sender.py:send():382] send: history
1950
+ 2024-02-01 17:55:50,907 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
1951
+ 2024-02-01 17:55:50,909 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
1952
+ 2024-02-01 17:55:50,992 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: internal_messages
1953
+ 2024-02-01 17:55:51,001 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: stop_status
1954
+ 2024-02-01 17:55:51,001 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: stop_status
1955
+ 2024-02-01 17:55:51,344 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
1956
+ 2024-02-01 17:55:51,344 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1957
+ 2024-02-01 17:55:52,345 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1958
+ 2024-02-01 17:55:54,600 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
1959
+ 2024-02-01 17:55:55,349 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1960
+ 2024-02-01 17:55:55,524 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
1961
+ 2024-02-01 17:55:55,525 DEBUG SenderThread:237521 [sender.py:send():382] send: history
1962
+ 2024-02-01 17:55:55,526 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
1963
+ 2024-02-01 17:55:55,527 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
1964
+ 2024-02-01 17:55:56,351 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
1965
+ 2024-02-01 17:55:56,352 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1966
+ 2024-02-01 17:55:58,964 DEBUG SenderThread:237521 [sender.py:send():382] send: stats
1967
+ 2024-02-01 17:55:59,355 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1968
+ 2024-02-01 17:56:00,151 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
1969
+ 2024-02-01 17:56:00,153 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
1970
+ 2024-02-01 17:56:00,154 DEBUG SenderThread:237521 [sender.py:send():382] send: history
1971
+ 2024-02-01 17:56:00,154 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
1972
+ 2024-02-01 17:56:00,156 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
1973
+ 2024-02-01 17:56:00,358 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
1974
+ 2024-02-01 17:56:01,359 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1975
+ 2024-02-01 17:56:02,360 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1976
+ 2024-02-01 17:56:04,771 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
1977
+ 2024-02-01 17:56:04,772 DEBUG SenderThread:237521 [sender.py:send():382] send: history
1978
+ 2024-02-01 17:56:04,772 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
1979
+ 2024-02-01 17:56:04,774 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
1980
+ 2024-02-01 17:56:05,365 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
1981
+ 2024-02-01 17:56:05,365 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1982
+ 2024-02-01 17:56:05,693 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
1983
+ 2024-02-01 17:56:05,992 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: internal_messages
1984
+ 2024-02-01 17:56:06,001 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: stop_status
1985
+ 2024-02-01 17:56:06,001 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: stop_status
1986
+ 2024-02-01 17:56:06,366 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1987
+ 2024-02-01 17:56:09,370 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1988
+ 2024-02-01 17:56:09,385 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
1989
+ 2024-02-01 17:56:09,387 DEBUG SenderThread:237521 [sender.py:send():382] send: history
1990
+ 2024-02-01 17:56:09,387 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
1991
+ 2024-02-01 17:56:09,388 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
1992
+ 2024-02-01 17:56:10,373 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
1993
+ 2024-02-01 17:56:10,373 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1994
+ 2024-02-01 17:56:11,235 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
1995
+ 2024-02-01 17:56:13,376 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
1996
+ 2024-02-01 17:56:14,003 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
1997
+ 2024-02-01 17:56:14,004 DEBUG SenderThread:237521 [sender.py:send():382] send: history
1998
+ 2024-02-01 17:56:14,005 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
1999
+ 2024-02-01 17:56:14,007 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
2000
+ 2024-02-01 17:56:14,379 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
2001
+ 2024-02-01 17:56:15,380 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2002
+ 2024-02-01 17:56:16,381 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2003
+ 2024-02-01 17:56:17,108 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
2004
+ 2024-02-01 17:56:19,385 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2005
+ 2024-02-01 17:56:20,388 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2006
+ 2024-02-01 17:56:22,348 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: internal_messages
2007
+ 2024-02-01 17:56:22,349 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
2008
+ 2024-02-01 17:56:22,349 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: stop_status
2009
+ 2024-02-01 17:56:22,350 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: stop_status
2010
+ 2024-02-01 17:56:24,393 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2011
+ 2024-02-01 17:56:27,397 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2012
+ 2024-02-01 17:56:27,423 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
2013
+ 2024-02-01 17:56:28,965 DEBUG SenderThread:237521 [sender.py:send():382] send: stats
2014
+ 2024-02-01 17:56:31,402 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2015
+ 2024-02-01 17:56:32,636 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
2016
+ 2024-02-01 17:56:33,405 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2017
+ 2024-02-01 17:56:33,565 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
2018
+ 2024-02-01 17:56:33,567 DEBUG SenderThread:237521 [sender.py:send():382] send: history
2019
+ 2024-02-01 17:56:33,567 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
2020
+ 2024-02-01 17:56:33,569 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
2021
+ 2024-02-01 17:56:34,407 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
2022
+ 2024-02-01 17:56:34,407 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2023
+ 2024-02-01 17:56:36,403 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: internal_messages
2024
+ 2024-02-01 17:56:36,403 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: stop_status
2025
+ 2024-02-01 17:56:36,403 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: stop_status
2026
+ 2024-02-01 17:56:37,412 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2027
+ 2024-02-01 17:56:38,179 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
2028
+ 2024-02-01 17:56:38,180 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
2029
+ 2024-02-01 17:56:38,181 DEBUG SenderThread:237521 [sender.py:send():382] send: history
2030
+ 2024-02-01 17:56:38,181 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
2031
+ 2024-02-01 17:56:38,183 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
2032
+ 2024-02-01 17:56:38,414 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
2033
+ 2024-02-01 17:56:39,415 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2034
+ 2024-02-01 17:56:40,416 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2035
+ 2024-02-01 17:56:42,804 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
2036
+ 2024-02-01 17:56:42,805 DEBUG SenderThread:237521 [sender.py:send():382] send: history
2037
+ 2024-02-01 17:56:42,806 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
2038
+ 2024-02-01 17:56:42,808 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
2039
+ 2024-02-01 17:56:43,421 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
2040
+ 2024-02-01 17:56:43,421 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2041
+ 2024-02-01 17:56:43,727 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
2042
+ 2024-02-01 17:56:44,423 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2043
+ 2024-02-01 17:56:47,427 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2044
+ 2024-02-01 17:56:47,430 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
2045
+ 2024-02-01 17:56:47,432 DEBUG SenderThread:237521 [sender.py:send():382] send: history
2046
+ 2024-02-01 17:56:47,432 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
2047
+ 2024-02-01 17:56:47,434 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
2048
+ 2024-02-01 17:56:48,429 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
2049
+ 2024-02-01 17:56:48,429 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2050
+ 2024-02-01 17:56:49,277 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
2051
+ 2024-02-01 17:56:51,403 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: internal_messages
2052
+ 2024-02-01 17:56:51,403 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: stop_status
2053
+ 2024-02-01 17:56:51,404 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: stop_status
2054
+ 2024-02-01 17:56:51,432 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2055
+ 2024-02-01 17:56:52,098 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
2056
+ 2024-02-01 17:56:52,100 DEBUG SenderThread:237521 [sender.py:send():382] send: history
2057
+ 2024-02-01 17:56:52,100 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
2058
+ 2024-02-01 17:56:52,102 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
2059
+ 2024-02-01 17:56:52,435 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
2060
+ 2024-02-01 17:56:53,436 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2061
+ 2024-02-01 17:56:54,437 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2062
+ 2024-02-01 17:56:54,828 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
2063
+ 2024-02-01 17:56:56,679 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: partial_history
2064
+ 2024-02-01 17:56:56,680 DEBUG SenderThread:237521 [sender.py:send():382] send: history
2065
+ 2024-02-01 17:56:56,681 DEBUG SenderThread:237521 [sender.py:send_request():409] send_request: summary_record
2066
+ 2024-02-01 17:56:56,683 INFO SenderThread:237521 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
2067
+ 2024-02-01 17:56:57,442 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/wandb-summary.json
2068
+ 2024-02-01 17:56:57,443 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2069
+ 2024-02-01 17:56:58,444 INFO Thread-12 :237521 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_173828-py26nu6m/files/output.log
2070
+ 2024-02-01 17:56:58,968 DEBUG SenderThread:237521 [sender.py:send():382] send: stats
2071
+ 2024-02-01 17:56:59,014 INFO memory :237521 [interfaces.py:monitor():140] Process proc.memory.rssMB has exited.
2072
+ 2024-02-01 17:56:59,015 DEBUG SystemMonitor:237521 [system_monitor.py:_start():179] Finished system metrics aggregation loop
2073
+ 2024-02-01 17:56:59,015 DEBUG SystemMonitor:237521 [system_monitor.py:_start():183] Publishing last batch of metrics
2074
+ 2024-02-01 17:56:59,026 DEBUG SenderThread:237521 [sender.py:send():382] send: stats
2075
+ 2024-02-01 17:56:59,959 INFO MainThread:237521 [internal.py:handle_exit():76] Internal process exited
2076
+ 2024-02-01 17:57:00,027 DEBUG HandlerThread:237521 [handler.py:handle_request():146] handle_request: status_report
wandb/run-20240201_173828-py26nu6m/run-py26nu6m.wandb CHANGED
Binary files a/wandb/run-20240201_173828-py26nu6m/run-py26nu6m.wandb and b/wandb/run-20240201_173828-py26nu6m/run-py26nu6m.wandb differ
 
wandb/run-20240201_175850-i93q0p12/files/conda-environment.yaml ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: venv
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - defaults
6
+ dependencies:
7
+ - _libgcc_mutex=0.1=main
8
+ - _openmp_mutex=5.1=1_gnu
9
+ - blas=1.0=mkl
10
+ - brotli-python=1.0.9=py311h6a678d5_7
11
+ - bzip2=1.0.8=h7b6447c_0
12
+ - ca-certificates=2023.12.12=h06a4308_0
13
+ - certifi=2023.11.17=py311h06a4308_0
14
+ - cffi=1.16.0=py311h5eee18b_0
15
+ - cryptography=41.0.7=py311hdda0065_0
16
+ - cuda-cudart=12.1.105=0
17
+ - cuda-cupti=12.1.105=0
18
+ - cuda-libraries=12.1.0=0
19
+ - cuda-nvrtc=12.1.105=0
20
+ - cuda-nvtx=12.1.105=0
21
+ - cuda-opencl=12.3.101=0
22
+ - cuda-runtime=12.1.0=0
23
+ - ffmpeg=4.3=hf484d3e_0
24
+ - filelock=3.13.1=py311h06a4308_0
25
+ - freetype=2.12.1=h4a9f257_0
26
+ - giflib=5.2.1=h5eee18b_3
27
+ - gmp=6.2.1=h295c915_3
28
+ - gmpy2=2.1.2=py311hc9b5ff0_0
29
+ - gnutls=3.6.15=he1e5248_0
30
+ - intel-openmp=2023.1.0=hdb19cb5_46306
31
+ - jinja2=3.1.2=py311h06a4308_0
32
+ - jpeg=9e=h5eee18b_1
33
+ - lame=3.100=h7b6447c_0
34
+ - lcms2=2.12=h3be6417_0
35
+ - ld_impl_linux-64=2.38=h1181459_1
36
+ - lerc=3.0=h295c915_0
37
+ - libcublas=12.1.0.26=0
38
+ - libcufft=11.0.2.4=0
39
+ - libcufile=1.8.1.2=0
40
+ - libcurand=10.3.4.101=0
41
+ - libcusolver=11.4.4.55=0
42
+ - libcusparse=12.0.2.55=0
43
+ - libdeflate=1.17=h5eee18b_1
44
+ - libffi=3.4.4=h6a678d5_0
45
+ - libgcc-ng=11.2.0=h1234567_1
46
+ - libgomp=11.2.0=h1234567_1
47
+ - libiconv=1.16=h7f8727e_2
48
+ - libidn2=2.3.4=h5eee18b_0
49
+ - libjpeg-turbo=2.0.0=h9bf148f_0
50
+ - libnpp=12.0.2.50=0
51
+ - libnvjitlink=12.1.105=0
52
+ - libnvjpeg=12.1.1.14=0
53
+ - libpng=1.6.39=h5eee18b_0
54
+ - libstdcxx-ng=11.2.0=h1234567_1
55
+ - libtasn1=4.19.0=h5eee18b_0
56
+ - libtiff=4.5.1=h6a678d5_0
57
+ - libunistring=0.9.10=h27cfd23_0
58
+ - libuuid=1.41.5=h5eee18b_0
59
+ - libwebp=1.3.2=h11a3e52_0
60
+ - libwebp-base=1.3.2=h5eee18b_0
61
+ - llvm-openmp=14.0.6=h9e868ea_0
62
+ - lz4-c=1.9.4=h6a678d5_0
63
+ - markupsafe=2.1.1=py311h5eee18b_0
64
+ - mkl=2023.1.0=h213fc3f_46344
65
+ - mkl-service=2.4.0=py311h5eee18b_1
66
+ - mkl_fft=1.3.8=py311h5eee18b_0
67
+ - mkl_random=1.2.4=py311hdb19cb5_0
68
+ - mpc=1.1.0=h10f8cd9_1
69
+ - mpfr=4.0.2=hb69a4c5_1
70
+ - mpmath=1.3.0=py311h06a4308_0
71
+ - ncurses=6.4=h6a678d5_0
72
+ - nettle=3.7.3=hbbd107a_1
73
+ - networkx=3.1=py311h06a4308_0
74
+ - numpy=1.26.2=py311h08b1b3b_0
75
+ - numpy-base=1.26.2=py311hf175353_0
76
+ - openh264=2.1.1=h4ff587b_0
77
+ - openjpeg=2.4.0=h3ad879b_0
78
+ - openssl=3.0.12=h7f8727e_0
79
+ - pycparser=2.21=pyhd3eb1b0_0
80
+ - pyopenssl=23.2.0=py311h06a4308_0
81
+ - pysocks=1.7.1=py311h06a4308_0
82
+ - python=3.11.5=h955ad1f_0
83
+ - pytorch=2.1.2=py3.11_cuda12.1_cudnn8.9.2_0
84
+ - pytorch-cuda=12.1=ha16c6d3_5
85
+ - pytorch-mutex=1.0=cuda
86
+ - pyyaml=6.0.1=py311h5eee18b_0
87
+ - readline=8.2=h5eee18b_0
88
+ - requests=2.31.0=py311h06a4308_0
89
+ - setuptools=68.2.2=py311h06a4308_0
90
+ - sqlite=3.41.2=h5eee18b_0
91
+ - sympy=1.12=py311h06a4308_0
92
+ - tbb=2021.8.0=hdb19cb5_0
93
+ - tk=8.6.12=h1ccaba5_0
94
+ - torchaudio=2.1.2=py311_cu121
95
+ - torchtriton=2.1.0=py311
96
+ - torchvision=0.16.2=py311_cu121
97
+ - typing_extensions=4.7.1=py311h06a4308_0
98
+ - wheel=0.41.2=py311h06a4308_0
99
+ - xz=5.4.5=h5eee18b_0
100
+ - yaml=0.2.5=h7b6447c_0
101
+ - zlib=1.2.13=h5eee18b_0
102
+ - zstd=1.5.5=hc292b87_0
103
+ - pip:
104
+ - absl-py==2.0.0
105
+ - accelerate==0.23.0
106
+ - aiohttp==3.9.1
107
+ - aiosignal==1.3.1
108
+ - annotated-types==0.6.0
109
+ - appdirs==1.4.4
110
+ - astunparse==1.6.3
111
+ - attrs==23.1.0
112
+ - audioread==3.0.1
113
+ - bitsandbytes==0.41.2.post2
114
+ - cachetools==5.3.2
115
+ - chardet==5.2.0
116
+ - charset-normalizer==3.3.2
117
+ - click==8.1.7
118
+ - datasets==2.14.6
119
+ - decorator==5.1.1
120
+ - deepspeed==0.12.2
121
+ - dill==0.3.7
122
+ - docker-pycreds==0.4.0
123
+ - docstring-parser==0.15
124
+ - einops==0.7.0
125
+ - evaluate==0.4.0
126
+ - flash-attn==2.5.2
127
+ - flatbuffers==23.5.26
128
+ - frozenlist==1.4.1
129
+ - fsspec==2023.10.0
130
+ - gast==0.5.4
131
+ - gitdb==4.0.11
132
+ - gitpython==3.1.40
133
+ - google-auth==2.26.1
134
+ - google-auth-oauthlib==1.2.0
135
+ - google-pasta==0.2.0
136
+ - grpcio==1.60.0
137
+ - h5py==3.10.0
138
+ - hf-transfer==0.1.5
139
+ - hjson==3.1.0
140
+ - huggingface-hub==0.20.1
141
+ - idna==3.6
142
+ - jiwer==3.0.3
143
+ - joblib==1.3.2
144
+ - keras==2.15.0
145
+ - lazy-loader==0.3
146
+ - libclang==16.0.6
147
+ - librosa==0.10.1
148
+ - llvmlite==0.41.1
149
+ - markdown==3.5.1
150
+ - markdown-it-py==3.0.0
151
+ - mdurl==0.1.2
152
+ - ml-dtypes==0.2.0
153
+ - msgpack==1.0.7
154
+ - multidict==6.0.4
155
+ - multiprocess==0.70.15
156
+ - ninja==1.11.1.1
157
+ - nltk==3.8.1
158
+ - numba==0.58.1
159
+ - oauthlib==3.2.2
160
+ - opt-einsum==3.3.0
161
+ - packaging==23.2
162
+ - pandas==2.1.4
163
+ - peft==0.7.1
164
+ - pillow==10.2.0
165
+ - pip==23.3.2
166
+ - platformdirs==4.1.0
167
+ - pooch==1.8.0
168
+ - protobuf==3.20.2
169
+ - psutil==5.9.7
170
+ - py-cpuinfo==9.0.0
171
+ - pyarrow==14.0.2
172
+ - pyarrow-hotfix==0.6
173
+ - pyasn1==0.5.1
174
+ - pyasn1-modules==0.3.0
175
+ - pydantic==2.6.0
176
+ - pydantic-core==2.16.1
177
+ - pygments==2.17.2
178
+ - pynvml==11.5.0
179
+ - python-dateutil==2.8.2
180
+ - pytz==2023.3.post1
181
+ - rapidfuzz==3.6.1
182
+ - regex==2023.12.25
183
+ - requests-oauthlib==1.3.1
184
+ - responses==0.18.0
185
+ - rich==13.7.0
186
+ - rsa==4.9
187
+ - safetensors==0.4.1
188
+ - scikit-learn==1.3.2
189
+ - scipy==1.11.4
190
+ - sentencepiece==0.1.99
191
+ - sentry-sdk==1.39.1
192
+ - setproctitle==1.3.3
193
+ - shtab==1.6.5
194
+ - six==1.16.0
195
+ - smmap==5.0.1
196
+ - soundfile==0.12.1
197
+ - soxr==0.3.7
198
+ - tensorboard==2.15.1
199
+ - tensorboard-data-server==0.7.2
200
+ - tensorflow-cpu==2.15.0.post1
201
+ - tensorflow-estimator==2.15.0
202
+ - tensorflow-io-gcs-filesystem==0.35.0
203
+ - termcolor==2.4.0
204
+ - threadpoolctl==3.2.0
205
+ - tokenizers==0.15.0
206
+ - tqdm==4.66.1
207
+ - transformers==4.36.2
208
+ - trl==0.7.7
209
+ - typing-extensions==4.9.0
210
+ - tyro==0.7.0
211
+ - tzdata==2023.3
212
+ - urllib3==2.1.0
213
+ - wandb==0.16.1
214
+ - werkzeug==3.0.1
215
+ - wrapt==1.14.1
216
+ - xxhash==3.4.1
217
+ - yarl==1.9.4
218
+ prefix: /fsx/sanchit/miniconda3/envs/venv
wandb/run-20240201_175850-i93q0p12/files/config.yaml ADDED
@@ -0,0 +1,660 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ python_version: 3.11.5
7
+ cli_version: 0.16.1
8
+ framework: huggingface
9
+ huggingface_version: 4.36.2
10
+ is_jupyter_run: false
11
+ is_kaggle_kernel: false
12
+ start_time: 1706810330.083967
13
+ t:
14
+ 1:
15
+ - 1
16
+ - 2
17
+ - 3
18
+ - 5
19
+ - 11
20
+ - 49
21
+ - 51
22
+ - 53
23
+ - 55
24
+ - 71
25
+ - 84
26
+ - 98
27
+ 2:
28
+ - 1
29
+ - 2
30
+ - 3
31
+ - 5
32
+ - 11
33
+ - 49
34
+ - 51
35
+ - 53
36
+ - 55
37
+ - 71
38
+ - 84
39
+ - 98
40
+ 3:
41
+ - 7
42
+ - 23
43
+ 4: 3.11.5
44
+ 5: 0.16.1
45
+ 6: 4.36.2
46
+ 8:
47
+ - 5
48
+ 9:
49
+ 1: transformers_trainer
50
+ 13: linux-x86_64
51
+ m:
52
+ - 1: train/global_step
53
+ 6:
54
+ - 3
55
+ - 1: train/loss
56
+ 5: 1
57
+ 6:
58
+ - 1
59
+ - 1: train/learning_rate
60
+ 5: 1
61
+ 6:
62
+ - 1
63
+ - 1: train/epoch
64
+ 5: 1
65
+ 6:
66
+ - 1
67
+ - 1: eval/loss
68
+ 5: 1
69
+ 6:
70
+ - 1
71
+ - 1: eval/runtime
72
+ 5: 1
73
+ 6:
74
+ - 1
75
+ - 1: eval/samples_per_second
76
+ 5: 1
77
+ 6:
78
+ - 1
79
+ - 1: eval/steps_per_second
80
+ 5: 1
81
+ 6:
82
+ - 1
83
+ vocab_size:
84
+ desc: null
85
+ value: 32000
86
+ max_position_embeddings:
87
+ desc: null
88
+ value: 32768
89
+ hidden_size:
90
+ desc: null
91
+ value: 4096
92
+ intermediate_size:
93
+ desc: null
94
+ value: 14336
95
+ num_hidden_layers:
96
+ desc: null
97
+ value: 6
98
+ num_attention_heads:
99
+ desc: null
100
+ value: 32
101
+ sliding_window:
102
+ desc: null
103
+ value: 4096
104
+ num_key_value_heads:
105
+ desc: null
106
+ value: 8
107
+ hidden_act:
108
+ desc: null
109
+ value: silu
110
+ initializer_range:
111
+ desc: null
112
+ value: 0.02
113
+ rms_norm_eps:
114
+ desc: null
115
+ value: 1.0e-05
116
+ use_cache:
117
+ desc: null
118
+ value: false
119
+ rope_theta:
120
+ desc: null
121
+ value: 10000.0
122
+ attention_dropout:
123
+ desc: null
124
+ value: 0.0
125
+ return_dict:
126
+ desc: null
127
+ value: true
128
+ output_hidden_states:
129
+ desc: null
130
+ value: false
131
+ output_attentions:
132
+ desc: null
133
+ value: false
134
+ torchscript:
135
+ desc: null
136
+ value: false
137
+ torch_dtype:
138
+ desc: null
139
+ value: bfloat16
140
+ use_bfloat16:
141
+ desc: null
142
+ value: false
143
+ tf_legacy_loss:
144
+ desc: null
145
+ value: false
146
+ pruned_heads:
147
+ desc: null
148
+ value: {}
149
+ tie_word_embeddings:
150
+ desc: null
151
+ value: false
152
+ is_encoder_decoder:
153
+ desc: null
154
+ value: false
155
+ is_decoder:
156
+ desc: null
157
+ value: false
158
+ cross_attention_hidden_size:
159
+ desc: null
160
+ value: null
161
+ add_cross_attention:
162
+ desc: null
163
+ value: false
164
+ tie_encoder_decoder:
165
+ desc: null
166
+ value: false
167
+ max_length:
168
+ desc: null
169
+ value: 20
170
+ min_length:
171
+ desc: null
172
+ value: 0
173
+ do_sample:
174
+ desc: null
175
+ value: false
176
+ early_stopping:
177
+ desc: null
178
+ value: false
179
+ num_beams:
180
+ desc: null
181
+ value: 1
182
+ num_beam_groups:
183
+ desc: null
184
+ value: 1
185
+ diversity_penalty:
186
+ desc: null
187
+ value: 0.0
188
+ temperature:
189
+ desc: null
190
+ value: 1.0
191
+ top_k:
192
+ desc: null
193
+ value: 50
194
+ top_p:
195
+ desc: null
196
+ value: 1.0
197
+ typical_p:
198
+ desc: null
199
+ value: 1.0
200
+ repetition_penalty:
201
+ desc: null
202
+ value: 1.0
203
+ length_penalty:
204
+ desc: null
205
+ value: 1.0
206
+ no_repeat_ngram_size:
207
+ desc: null
208
+ value: 0
209
+ encoder_no_repeat_ngram_size:
210
+ desc: null
211
+ value: 0
212
+ bad_words_ids:
213
+ desc: null
214
+ value: null
215
+ num_return_sequences:
216
+ desc: null
217
+ value: 1
218
+ chunk_size_feed_forward:
219
+ desc: null
220
+ value: 0
221
+ output_scores:
222
+ desc: null
223
+ value: false
224
+ return_dict_in_generate:
225
+ desc: null
226
+ value: false
227
+ forced_bos_token_id:
228
+ desc: null
229
+ value: null
230
+ forced_eos_token_id:
231
+ desc: null
232
+ value: null
233
+ remove_invalid_values:
234
+ desc: null
235
+ value: false
236
+ exponential_decay_length_penalty:
237
+ desc: null
238
+ value: null
239
+ suppress_tokens:
240
+ desc: null
241
+ value: null
242
+ begin_suppress_tokens:
243
+ desc: null
244
+ value: null
245
+ architectures:
246
+ desc: null
247
+ value:
248
+ - MistralForCausalLM
249
+ finetuning_task:
250
+ desc: null
251
+ value: null
252
+ id2label:
253
+ desc: null
254
+ value:
255
+ '0': LABEL_0
256
+ '1': LABEL_1
257
+ label2id:
258
+ desc: null
259
+ value:
260
+ LABEL_0: 0
261
+ LABEL_1: 1
262
+ tokenizer_class:
263
+ desc: null
264
+ value: null
265
+ prefix:
266
+ desc: null
267
+ value: null
268
+ bos_token_id:
269
+ desc: null
270
+ value: 1
271
+ pad_token_id:
272
+ desc: null
273
+ value: null
274
+ eos_token_id:
275
+ desc: null
276
+ value: 2
277
+ sep_token_id:
278
+ desc: null
279
+ value: null
280
+ decoder_start_token_id:
281
+ desc: null
282
+ value: null
283
+ task_specific_params:
284
+ desc: null
285
+ value: null
286
+ problem_type:
287
+ desc: null
288
+ value: null
289
+ _name_or_path:
290
+ desc: null
291
+ value: sanchit-gandhi/Mistral-7B-v0.1-6-layer
292
+ transformers_version:
293
+ desc: null
294
+ value: 4.36.2
295
+ model_type:
296
+ desc: null
297
+ value: mistral
298
+ output_dir:
299
+ desc: null
300
+ value: ./
301
+ overwrite_output_dir:
302
+ desc: null
303
+ value: true
304
+ do_train:
305
+ desc: null
306
+ value: false
307
+ do_eval:
308
+ desc: null
309
+ value: true
310
+ do_predict:
311
+ desc: null
312
+ value: false
313
+ evaluation_strategy:
314
+ desc: null
315
+ value: epoch
316
+ prediction_loss_only:
317
+ desc: null
318
+ value: false
319
+ per_device_train_batch_size:
320
+ desc: null
321
+ value: 64
322
+ per_device_eval_batch_size:
323
+ desc: null
324
+ value: 32
325
+ per_gpu_train_batch_size:
326
+ desc: null
327
+ value: null
328
+ per_gpu_eval_batch_size:
329
+ desc: null
330
+ value: null
331
+ gradient_accumulation_steps:
332
+ desc: null
333
+ value: 1
334
+ eval_accumulation_steps:
335
+ desc: null
336
+ value: null
337
+ eval_delay:
338
+ desc: null
339
+ value: 0
340
+ learning_rate:
341
+ desc: null
342
+ value: 0.0003
343
+ weight_decay:
344
+ desc: null
345
+ value: 0.0
346
+ adam_beta1:
347
+ desc: null
348
+ value: 0.9
349
+ adam_beta2:
350
+ desc: null
351
+ value: 0.999
352
+ adam_epsilon:
353
+ desc: null
354
+ value: 1.0e-08
355
+ max_grad_norm:
356
+ desc: null
357
+ value: 1.0
358
+ num_train_epochs:
359
+ desc: null
360
+ value: 5
361
+ max_steps:
362
+ desc: null
363
+ value: -1
364
+ lr_scheduler_type:
365
+ desc: null
366
+ value: cosine
367
+ lr_scheduler_kwargs:
368
+ desc: null
369
+ value: {}
370
+ warmup_ratio:
371
+ desc: null
372
+ value: 0.1
373
+ warmup_steps:
374
+ desc: null
375
+ value: 0
376
+ log_level:
377
+ desc: null
378
+ value: info
379
+ log_level_replica:
380
+ desc: null
381
+ value: warning
382
+ log_on_each_node:
383
+ desc: null
384
+ value: true
385
+ logging_dir:
386
+ desc: null
387
+ value: ./runs/Feb01_17-58-13_ip-26-0-165-24
388
+ logging_strategy:
389
+ desc: null
390
+ value: steps
391
+ logging_first_step:
392
+ desc: null
393
+ value: true
394
+ logging_steps:
395
+ desc: null
396
+ value: 10
397
+ logging_nan_inf_filter:
398
+ desc: null
399
+ value: true
400
+ save_strategy:
401
+ desc: null
402
+ value: epoch
403
+ save_steps:
404
+ desc: null
405
+ value: 500
406
+ save_total_limit:
407
+ desc: null
408
+ value: 1
409
+ save_safetensors:
410
+ desc: null
411
+ value: true
412
+ save_on_each_node:
413
+ desc: null
414
+ value: false
415
+ save_only_model:
416
+ desc: null
417
+ value: false
418
+ no_cuda:
419
+ desc: null
420
+ value: false
421
+ use_cpu:
422
+ desc: null
423
+ value: false
424
+ use_mps_device:
425
+ desc: null
426
+ value: false
427
+ seed:
428
+ desc: null
429
+ value: 42
430
+ data_seed:
431
+ desc: null
432
+ value: null
433
+ jit_mode_eval:
434
+ desc: null
435
+ value: false
436
+ use_ipex:
437
+ desc: null
438
+ value: false
439
+ bf16:
440
+ desc: null
441
+ value: true
442
+ fp16:
443
+ desc: null
444
+ value: false
445
+ fp16_opt_level:
446
+ desc: null
447
+ value: O1
448
+ half_precision_backend:
449
+ desc: null
450
+ value: auto
451
+ bf16_full_eval:
452
+ desc: null
453
+ value: false
454
+ fp16_full_eval:
455
+ desc: null
456
+ value: false
457
+ tf32:
458
+ desc: null
459
+ value: null
460
+ local_rank:
461
+ desc: null
462
+ value: 0
463
+ ddp_backend:
464
+ desc: null
465
+ value: null
466
+ tpu_num_cores:
467
+ desc: null
468
+ value: null
469
+ tpu_metrics_debug:
470
+ desc: null
471
+ value: false
472
+ debug:
473
+ desc: null
474
+ value: []
475
+ dataloader_drop_last:
476
+ desc: null
477
+ value: false
478
+ eval_steps:
479
+ desc: null
480
+ value: null
481
+ dataloader_num_workers:
482
+ desc: null
483
+ value: 0
484
+ past_index:
485
+ desc: null
486
+ value: -1
487
+ run_name:
488
+ desc: null
489
+ value: ./
490
+ disable_tqdm:
491
+ desc: null
492
+ value: false
493
+ remove_unused_columns:
494
+ desc: null
495
+ value: true
496
+ label_names:
497
+ desc: null
498
+ value: null
499
+ load_best_model_at_end:
500
+ desc: null
501
+ value: false
502
+ metric_for_best_model:
503
+ desc: null
504
+ value: null
505
+ greater_is_better:
506
+ desc: null
507
+ value: null
508
+ ignore_data_skip:
509
+ desc: null
510
+ value: false
511
+ fsdp:
512
+ desc: null
513
+ value: []
514
+ fsdp_min_num_params:
515
+ desc: null
516
+ value: 0
517
+ fsdp_config:
518
+ desc: null
519
+ value:
520
+ min_num_params: 0
521
+ xla: false
522
+ xla_fsdp_grad_ckpt: false
523
+ fsdp_transformer_layer_cls_to_wrap:
524
+ desc: null
525
+ value: null
526
+ deepspeed:
527
+ desc: null
528
+ value: null
529
+ label_smoothing_factor:
530
+ desc: null
531
+ value: 0.0
532
+ optim:
533
+ desc: null
534
+ value: adamw_torch
535
+ optim_args:
536
+ desc: null
537
+ value: null
538
+ adafactor:
539
+ desc: null
540
+ value: false
541
+ group_by_length:
542
+ desc: null
543
+ value: false
544
+ length_column_name:
545
+ desc: null
546
+ value: length
547
+ report_to:
548
+ desc: null
549
+ value:
550
+ - tensorboard
551
+ - wandb
552
+ ddp_find_unused_parameters:
553
+ desc: null
554
+ value: null
555
+ ddp_bucket_cap_mb:
556
+ desc: null
557
+ value: null
558
+ ddp_broadcast_buffers:
559
+ desc: null
560
+ value: null
561
+ dataloader_pin_memory:
562
+ desc: null
563
+ value: true
564
+ dataloader_persistent_workers:
565
+ desc: null
566
+ value: false
567
+ skip_memory_metrics:
568
+ desc: null
569
+ value: true
570
+ use_legacy_prediction_loop:
571
+ desc: null
572
+ value: false
573
+ push_to_hub:
574
+ desc: null
575
+ value: true
576
+ resume_from_checkpoint:
577
+ desc: null
578
+ value: null
579
+ hub_model_id:
580
+ desc: null
581
+ value: null
582
+ hub_strategy:
583
+ desc: null
584
+ value: every_save
585
+ hub_token:
586
+ desc: null
587
+ value: <HUB_TOKEN>
588
+ hub_private_repo:
589
+ desc: null
590
+ value: false
591
+ hub_always_push:
592
+ desc: null
593
+ value: false
594
+ gradient_checkpointing:
595
+ desc: null
596
+ value: true
597
+ gradient_checkpointing_kwargs:
598
+ desc: null
599
+ value:
600
+ use_reentrant: false
601
+ include_inputs_for_metrics:
602
+ desc: null
603
+ value: false
604
+ fp16_backend:
605
+ desc: null
606
+ value: auto
607
+ push_to_hub_model_id:
608
+ desc: null
609
+ value: null
610
+ push_to_hub_organization:
611
+ desc: null
612
+ value: null
613
+ push_to_hub_token:
614
+ desc: null
615
+ value: <PUSH_TO_HUB_TOKEN>
616
+ mp_parameters:
617
+ desc: null
618
+ value: ''
619
+ auto_find_batch_size:
620
+ desc: null
621
+ value: false
622
+ full_determinism:
623
+ desc: null
624
+ value: false
625
+ torchdynamo:
626
+ desc: null
627
+ value: null
628
+ ray_scope:
629
+ desc: null
630
+ value: last
631
+ ddp_timeout:
632
+ desc: null
633
+ value: 1800
634
+ torch_compile:
635
+ desc: null
636
+ value: false
637
+ torch_compile_backend:
638
+ desc: null
639
+ value: null
640
+ torch_compile_mode:
641
+ desc: null
642
+ value: null
643
+ dispatch_batches:
644
+ desc: null
645
+ value: null
646
+ split_batches:
647
+ desc: null
648
+ value: false
649
+ include_tokens_per_second:
650
+ desc: null
651
+ value: false
652
+ include_num_input_tokens_seen:
653
+ desc: null
654
+ value: false
655
+ neftune_noise_alpha:
656
+ desc: null
657
+ value: null
658
+ max_seq_length:
659
+ desc: null
660
+ value: 2048
wandb/run-20240201_175850-i93q0p12/files/output.log ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ 0%| | 0/1365 [00:00<?, ?it/s][WARNING|logging.py:314] 2024-02-01 17:58:55,470 >> You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
3
+ 0%| | 1/1365 [00:11<4:10:37, 11.02s/it]
4
+ [2024-02-01 17:59:06,373] [WARNING] [stage3.py:1949:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+ 1%|▌ | 10/1365 [00:43<1:23:26, 3.70s/it]
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+ 1%|█▏ | 20/1365 [01:19<1:20:49, 3.61s/it]
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+ 2%|█▋ | 30/1365 [01:55<1:20:11, 3.60s/it]
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+ 3%|██▎ | 40/1365 [02:31<1:19:28, 3.60s/it]
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+ 4%|██▉ | 50/1365 [03:07<1:19:00, 3.61s/it]
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+ 4%|███▍ | 60/1365 [03:43<1:18:34, 3.61s/it]
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+ 5%|████ | 70/1365 [04:19<1:17:55, 3.61s/it]
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+ 6%|████▋ | 80/1365 [04:55<1:17:12, 3.61s/it]
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+ 7%|█████▏ | 90/1365 [05:31<1:16:32, 3.60s/it]
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+ 7%|█████▋ | 100/1365 [06:07<1:16:06, 3.61s/it]
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+ 8%|██████▎ | 110/1365 [06:44<1:15:24, 3.60s/it]
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+ 9%|██████▊ | 120/1365 [07:20<1:14:48, 3.61s/it]
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+ 10%|███████▍ | 130/1365 [07:56<1:14:12, 3.61s/it]
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+ 10%|████████ | 140/1365 [08:32<1:13:29, 3.60s/it]
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+ 11%|████████▌ | 150/1365 [09:07<1:12:25, 3.58s/it]
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+ 12%|█████████▏ | 160/1365 [09:43<1:12:00, 3.59s/it]
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+ 12%|█████████▋ | 170/1365 [10:20<1:12:29, 3.64s/it]
191
+
192
+
193
+
194
+
195
+
196
+
197
+
198
+
199
+
200
+ 13%|██████████▏ | 179/1365 [10:52<1:11:26, 3.61s/it]
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+ 14%|██████████▊ | 189/1365 [11:28<1:10:32, 3.60s/it]
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+ 15%|███████████▎ | 199/1365 [12:04<1:09:55, 3.60s/it]
223
+
224
+
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+
233
+
234
+ 15%|████████████ | 210/1365 [12:44<1:09:11, 3.59s/it]
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
+
244
+
245
+ 16%|████████████▌ | 220/1365 [13:20<1:08:33, 3.59s/it]
246
+
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+ 17%|█████████████▏ | 230/1365 [13:56<1:08:01, 3.60s/it]
257
+
258
+
259
+
260
+
261
+
262
+
263
+
264
+
265
+
266
+
267
+ 18%|█████████████▋ | 240/1365 [14:32<1:07:15, 3.59s/it]
268
+
269
+
270
+
271
+
272
+
273
+
274
+
275
+
276
+
277
+
278
+ 18%|██████████████▎ | 250/1365 [15:07<1:06:48, 3.59s/it]
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
+
288
+
289
+ 19%|██████████████▊ | 260/1365 [15:44<1:06:51, 3.63s/it]
290
+
291
+
292
+
293
+
294
+
295
+
296
+
297
+
298
+
299
+
300
+ 20%|███████████████▍ | 270/1365 [16:20<1:06:13, 3.63s/it]
301
+
302
+
303
+ 20%|███████████████▌ | 273/1365 [16:31<1:05:09, 3.58s/it][INFO|trainer.py:3166] 2024-02-01 18:15:26,429 >> ***** Running Evaluation *****
304
+ [INFO|trainer.py:3168] 2024-02-01 18:15:26,429 >> Num examples = 15431
305
+ [INFO|trainer.py:3171] 2024-02-01 18:15:26,429 >> Batch size = 32
306
+
307
+
308
+
309
+
310
+
311
+
312
+
313
+
314
+
315
+
316
+
317
+
318
+
319
+
320
+
321
+ 95%|██████████████████████████████████████████████████████████████████████████████▉ | 58/61 [00:28<00:01, 1.99it/s]
322
+ 20%|███████████████▌ | 273/1365 [17:01<1:05:09, 3.58s/it][INFO|trainer.py:2889] 2024-02-01 18:15:58,319 >> Saving model checkpoint to ./tmp-checkpoint-273
323
+ [INFO|configuration_utils.py:483] 2024-02-01 18:15:58,323 >> Configuration saved in ./tmp-checkpoint-273/config.json
324
+ [INFO|configuration_utils.py:594] 2024-02-01 18:15:58,326 >> Configuration saved in ./tmp-checkpoint-273/generation_config.json
325
+ [INFO|modeling_utils.py:2382] 2024-02-01 18:16:01,522 >> Model weights saved in ./tmp-checkpoint-273/pytorch_model.bin
326
+ [INFO|tokenization_utils_base.py:2432] 2024-02-01 18:16:01,541 >> tokenizer config file saved in ./tmp-checkpoint-273/tokenizer_config.json
327
+ [INFO|tokenization_utils_base.py:2441] 2024-02-01 18:16:01,543 >> Special tokens file saved in ./tmp-checkpoint-273/special_tokens_map.json
328
+ /fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
329
+ warnings.warn(
330
+ [2024-02-01 18:16:01,626] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step273 is about to be saved!
331
+ [2024-02-01 18:16:01,783] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-273/global_step273/zero_pp_rank_0_mp_rank_00_model_states.pt
332
+ [2024-02-01 18:16:01,784] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-273/global_step273/zero_pp_rank_0_mp_rank_00_model_states.pt...
333
+ [2024-02-01 18:16:01,787] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-273/global_step273/zero_pp_rank_0_mp_rank_00_model_states.pt.
334
+ [2024-02-01 18:16:01,792] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-273/global_step273/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
335
+ [2024-02-01 18:16:05,650] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-273/global_step273/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
336
+ [2024-02-01 18:16:05,658] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-273/global_step273/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
337
+ [2024-02-01 18:16:05,936] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step273 is ready now!
338
+ [INFO|tokenization_utils_base.py:2432] 2024-02-01 18:16:08,421 >> tokenizer config file saved in ./tokenizer_config.json
339
+ [INFO|tokenization_utils_base.py:2441] 2024-02-01 18:16:08,424 >> Special tokens file saved in ./special_tokens_map.json
340
+
341
+
342
+
343
+
344
+
345
+
346
+ 21%|████████████████ | 280/1365 [17:38<1:31:41, 5.07s/it]
347
+
348
+
349
+
350
+
351
+
352
+
353
+
354
+
355
+
356
+
357
+ 21%|████████████████▌ | 290/1365 [18:14<1:04:59, 3.63s/it]
358
+
wandb/run-20240201_175850-i93q0p12/files/requirements.txt ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.0.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ alignment-handbook==0.4.0.dev0
6
+ annotated-types==0.6.0
7
+ appdirs==1.4.4
8
+ astunparse==1.6.3
9
+ attrs==23.1.0
10
+ audioread==3.0.1
11
+ bitsandbytes==0.41.2.post2
12
+ brotli==1.0.9
13
+ cachetools==5.3.2
14
+ certifi==2023.11.17
15
+ cffi==1.16.0
16
+ chardet==5.2.0
17
+ charset-normalizer==2.0.4
18
+ click==8.1.7
19
+ cryptography==41.0.7
20
+ datasets==2.14.6
21
+ decorator==5.1.1
22
+ deepspeed==0.12.2
23
+ dill==0.3.7
24
+ docker-pycreds==0.4.0
25
+ docstring-parser==0.15
26
+ einops==0.7.0
27
+ evaluate==0.4.0
28
+ filelock==3.13.1
29
+ flash-attn==2.5.2
30
+ flatbuffers==23.5.26
31
+ frozenlist==1.4.1
32
+ fsspec==2023.10.0
33
+ gast==0.5.4
34
+ gitdb==4.0.11
35
+ gitpython==3.1.40
36
+ gmpy2==2.1.2
37
+ google-auth-oauthlib==1.2.0
38
+ google-auth==2.26.1
39
+ google-pasta==0.2.0
40
+ grpcio==1.60.0
41
+ h5py==3.10.0
42
+ hf-transfer==0.1.5
43
+ hjson==3.1.0
44
+ huggingface-hub==0.20.1
45
+ idna==3.4
46
+ jinja2==3.1.2
47
+ jiwer==3.0.3
48
+ joblib==1.3.2
49
+ keras==2.15.0
50
+ lazy-loader==0.3
51
+ libclang==16.0.6
52
+ librosa==0.10.1
53
+ llvmlite==0.41.1
54
+ markdown-it-py==3.0.0
55
+ markdown==3.5.1
56
+ markupsafe==2.1.1
57
+ mdurl==0.1.2
58
+ mkl-fft==1.3.8
59
+ mkl-random==1.2.4
60
+ mkl-service==2.4.0
61
+ ml-dtypes==0.2.0
62
+ mpmath==1.3.0
63
+ msgpack==1.0.7
64
+ multidict==6.0.4
65
+ multiprocess==0.70.15
66
+ networkx==3.1
67
+ ninja==1.11.1.1
68
+ nltk==3.8.1
69
+ numba==0.58.1
70
+ numpy==1.26.2
71
+ oauthlib==3.2.2
72
+ opt-einsum==3.3.0
73
+ packaging==23.2
74
+ pandas==2.1.4
75
+ peft==0.7.1
76
+ pillow==10.2.0
77
+ pip==23.3.2
78
+ platformdirs==4.1.0
79
+ pooch==1.8.0
80
+ protobuf==3.20.2
81
+ psutil==5.9.7
82
+ py-cpuinfo==9.0.0
83
+ pyarrow-hotfix==0.6
84
+ pyarrow==14.0.2
85
+ pyasn1-modules==0.3.0
86
+ pyasn1==0.5.1
87
+ pycparser==2.21
88
+ pydantic-core==2.16.1
89
+ pydantic==2.6.0
90
+ pygments==2.17.2
91
+ pynvml==11.5.0
92
+ pyopenssl==23.2.0
93
+ pysocks==1.7.1
94
+ python-dateutil==2.8.2
95
+ pytz==2023.3.post1
96
+ pyyaml==6.0.1
97
+ rapidfuzz==3.6.1
98
+ regex==2023.12.25
99
+ requests-oauthlib==1.3.1
100
+ requests==2.31.0
101
+ responses==0.18.0
102
+ rich==13.7.0
103
+ rsa==4.9
104
+ safetensors==0.4.1
105
+ scikit-learn==1.3.2
106
+ scipy==1.11.4
107
+ sentencepiece==0.1.99
108
+ sentry-sdk==1.39.1
109
+ setproctitle==1.3.3
110
+ setuptools==68.2.2
111
+ shtab==1.6.5
112
+ six==1.16.0
113
+ smmap==5.0.1
114
+ soundfile==0.12.1
115
+ soxr==0.3.7
116
+ sympy==1.12
117
+ tensorboard-data-server==0.7.2
118
+ tensorboard==2.15.1
119
+ tensorflow-cpu==2.15.0.post1
120
+ tensorflow-estimator==2.15.0
121
+ tensorflow-io-gcs-filesystem==0.35.0
122
+ termcolor==2.4.0
123
+ threadpoolctl==3.2.0
124
+ tokenizers==0.15.0
125
+ torch==2.1.2
126
+ torchaudio==2.1.2
127
+ torchvision==0.16.2
128
+ tqdm==4.66.1
129
+ transformers==4.36.2
130
+ triton==2.1.0
131
+ trl==0.7.7
132
+ typing-extensions==4.7.1
133
+ tyro==0.7.0
134
+ tzdata==2023.3
135
+ urllib3==1.26.18
136
+ wandb==0.16.1
137
+ werkzeug==3.0.1
138
+ wheel==0.41.2
139
+ wrapt==1.14.1
140
+ xxhash==3.4.1
141
+ yarl==1.9.4
wandb/run-20240201_175850-i93q0p12/files/wandb-metadata.json ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1048-aws-x86_64-with-glibc2.31",
3
+ "python": "3.11.5",
4
+ "heartbeatAt": "2024-02-01T17:58:50.615903",
5
+ "startedAt": "2024-02-01T17:58:50.064651",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "config_full.yaml"
10
+ ],
11
+ "state": "running",
12
+ "program": "/fsx/sanchit/distil-zephyr-1.5b-ssft/run_sft.py",
13
+ "codePathLocal": "run_sft.py",
14
+ "codePath": "run_sft.py",
15
+ "git": {
16
+ "remote": "https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft",
17
+ "commit": "79a4ae874a71e67016ded927e7d23351e5c7dab8"
18
+ },
19
+ "email": null,
20
+ "root": "/fsx/sanchit/distil-zephyr-1.5b-ssft",
21
+ "host": "ip-26-0-165-24",
22
+ "username": "sanchit",
23
+ "executable": "/fsx/sanchit/miniconda3/envs/venv/bin/python",
24
+ "cpu_count": 96,
25
+ "cpu_count_logical": 96,
26
+ "cpu_freq": {
27
+ "current": 2728.6971041666666,
28
+ "min": 0.0,
29
+ "max": 0.0
30
+ },
31
+ "cpu_freq_per_core": [
32
+ {
33
+ "current": 2650.0,
34
+ "min": 0.0,
35
+ "max": 0.0
36
+ },
37
+ {
38
+ "current": 2650.0,
39
+ "min": 0.0,
40
+ "max": 0.0
41
+ },
42
+ {
43
+ "current": 2650.0,
44
+ "min": 0.0,
45
+ "max": 0.0
46
+ },
47
+ {
48
+ "current": 2650.0,
49
+ "min": 0.0,
50
+ "max": 0.0
51
+ },
52
+ {
53
+ "current": 2650.0,
54
+ "min": 0.0,
55
+ "max": 0.0
56
+ },
57
+ {
58
+ "current": 2650.0,
59
+ "min": 0.0,
60
+ "max": 0.0
61
+ },
62
+ {
63
+ "current": 2650.0,
64
+ "min": 0.0,
65
+ "max": 0.0
66
+ },
67
+ {
68
+ "current": 3589.018,
69
+ "min": 0.0,
70
+ "max": 0.0
71
+ },
72
+ {
73
+ "current": 2650.0,
74
+ "min": 0.0,
75
+ "max": 0.0
76
+ },
77
+ {
78
+ "current": 2650.0,
79
+ "min": 0.0,
80
+ "max": 0.0
81
+ },
82
+ {
83
+ "current": 3587.597,
84
+ "min": 0.0,
85
+ "max": 0.0
86
+ },
87
+ {
88
+ "current": 2650.0,
89
+ "min": 0.0,
90
+ "max": 0.0
91
+ },
92
+ {
93
+ "current": 3596.958,
94
+ "min": 0.0,
95
+ "max": 0.0
96
+ },
97
+ {
98
+ "current": 2650.0,
99
+ "min": 0.0,
100
+ "max": 0.0
101
+ },
102
+ {
103
+ "current": 2650.0,
104
+ "min": 0.0,
105
+ "max": 0.0
106
+ },
107
+ {
108
+ "current": 2650.0,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 3597.83,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2650.0,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 3595.048,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2650.0,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2650.0,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2650.0,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2650.0,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 3591.991,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 3598.381,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 3598.099,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2650.0,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2650.0,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2650.0,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2650.0,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2650.0,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2650.0,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2650.0,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2650.0,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ },
202
+ {
203
+ "current": 2650.0,
204
+ "min": 0.0,
205
+ "max": 0.0
206
+ },
207
+ {
208
+ "current": 2650.0,
209
+ "min": 0.0,
210
+ "max": 0.0
211
+ },
212
+ {
213
+ "current": 2650.0,
214
+ "min": 0.0,
215
+ "max": 0.0
216
+ },
217
+ {
218
+ "current": 2650.0,
219
+ "min": 0.0,
220
+ "max": 0.0
221
+ },
222
+ {
223
+ "current": 2650.0,
224
+ "min": 0.0,
225
+ "max": 0.0
226
+ },
227
+ {
228
+ "current": 2650.0,
229
+ "min": 0.0,
230
+ "max": 0.0
231
+ },
232
+ {
233
+ "current": 2650.0,
234
+ "min": 0.0,
235
+ "max": 0.0
236
+ },
237
+ {
238
+ "current": 2650.0,
239
+ "min": 0.0,
240
+ "max": 0.0
241
+ },
242
+ {
243
+ "current": 2650.0,
244
+ "min": 0.0,
245
+ "max": 0.0
246
+ },
247
+ {
248
+ "current": 2650.0,
249
+ "min": 0.0,
250
+ "max": 0.0
251
+ },
252
+ {
253
+ "current": 2650.0,
254
+ "min": 0.0,
255
+ "max": 0.0
256
+ },
257
+ {
258
+ "current": 2650.0,
259
+ "min": 0.0,
260
+ "max": 0.0
261
+ },
262
+ {
263
+ "current": 2650.0,
264
+ "min": 0.0,
265
+ "max": 0.0
266
+ },
267
+ {
268
+ "current": 2650.0,
269
+ "min": 0.0,
270
+ "max": 0.0
271
+ },
272
+ {
273
+ "current": 2650.0,
274
+ "min": 0.0,
275
+ "max": 0.0
276
+ },
277
+ {
278
+ "current": 2650.0,
279
+ "min": 0.0,
280
+ "max": 0.0
281
+ },
282
+ {
283
+ "current": 2650.0,
284
+ "min": 0.0,
285
+ "max": 0.0
286
+ },
287
+ {
288
+ "current": 2650.0,
289
+ "min": 0.0,
290
+ "max": 0.0
291
+ },
292
+ {
293
+ "current": 2650.0,
294
+ "min": 0.0,
295
+ "max": 0.0
296
+ },
297
+ {
298
+ "current": 2650.0,
299
+ "min": 0.0,
300
+ "max": 0.0
301
+ },
302
+ {
303
+ "current": 2650.0,
304
+ "min": 0.0,
305
+ "max": 0.0
306
+ },
307
+ {
308
+ "current": 2650.0,
309
+ "min": 0.0,
310
+ "max": 0.0
311
+ },
312
+ {
313
+ "current": 2650.0,
314
+ "min": 0.0,
315
+ "max": 0.0
316
+ },
317
+ {
318
+ "current": 2650.0,
319
+ "min": 0.0,
320
+ "max": 0.0
321
+ },
322
+ {
323
+ "current": 2650.0,
324
+ "min": 0.0,
325
+ "max": 0.0
326
+ },
327
+ {
328
+ "current": 2650.0,
329
+ "min": 0.0,
330
+ "max": 0.0
331
+ },
332
+ {
333
+ "current": 2650.0,
334
+ "min": 0.0,
335
+ "max": 0.0
336
+ },
337
+ {
338
+ "current": 2650.0,
339
+ "min": 0.0,
340
+ "max": 0.0
341
+ },
342
+ {
343
+ "current": 2650.0,
344
+ "min": 0.0,
345
+ "max": 0.0
346
+ },
347
+ {
348
+ "current": 2650.0,
349
+ "min": 0.0,
350
+ "max": 0.0
351
+ },
352
+ {
353
+ "current": 2650.0,
354
+ "min": 0.0,
355
+ "max": 0.0
356
+ },
357
+ {
358
+ "current": 2650.0,
359
+ "min": 0.0,
360
+ "max": 0.0
361
+ },
362
+ {
363
+ "current": 2650.0,
364
+ "min": 0.0,
365
+ "max": 0.0
366
+ },
367
+ {
368
+ "current": 2650.0,
369
+ "min": 0.0,
370
+ "max": 0.0
371
+ },
372
+ {
373
+ "current": 2650.0,
374
+ "min": 0.0,
375
+ "max": 0.0
376
+ },
377
+ {
378
+ "current": 2650.0,
379
+ "min": 0.0,
380
+ "max": 0.0
381
+ },
382
+ {
383
+ "current": 2650.0,
384
+ "min": 0.0,
385
+ "max": 0.0
386
+ },
387
+ {
388
+ "current": 2650.0,
389
+ "min": 0.0,
390
+ "max": 0.0
391
+ },
392
+ {
393
+ "current": 2650.0,
394
+ "min": 0.0,
395
+ "max": 0.0
396
+ },
397
+ {
398
+ "current": 2650.0,
399
+ "min": 0.0,
400
+ "max": 0.0
401
+ },
402
+ {
403
+ "current": 2650.0,
404
+ "min": 0.0,
405
+ "max": 0.0
406
+ },
407
+ {
408
+ "current": 2650.0,
409
+ "min": 0.0,
410
+ "max": 0.0
411
+ },
412
+ {
413
+ "current": 2650.0,
414
+ "min": 0.0,
415
+ "max": 0.0
416
+ },
417
+ {
418
+ "current": 2650.0,
419
+ "min": 0.0,
420
+ "max": 0.0
421
+ },
422
+ {
423
+ "current": 2650.0,
424
+ "min": 0.0,
425
+ "max": 0.0
426
+ },
427
+ {
428
+ "current": 2650.0,
429
+ "min": 0.0,
430
+ "max": 0.0
431
+ },
432
+ {
433
+ "current": 2650.0,
434
+ "min": 0.0,
435
+ "max": 0.0
436
+ },
437
+ {
438
+ "current": 2650.0,
439
+ "min": 0.0,
440
+ "max": 0.0
441
+ },
442
+ {
443
+ "current": 2650.0,
444
+ "min": 0.0,
445
+ "max": 0.0
446
+ },
447
+ {
448
+ "current": 2650.0,
449
+ "min": 0.0,
450
+ "max": 0.0
451
+ },
452
+ {
453
+ "current": 2650.0,
454
+ "min": 0.0,
455
+ "max": 0.0
456
+ },
457
+ {
458
+ "current": 2650.0,
459
+ "min": 0.0,
460
+ "max": 0.0
461
+ },
462
+ {
463
+ "current": 2650.0,
464
+ "min": 0.0,
465
+ "max": 0.0
466
+ },
467
+ {
468
+ "current": 2650.0,
469
+ "min": 0.0,
470
+ "max": 0.0
471
+ },
472
+ {
473
+ "current": 2650.0,
474
+ "min": 0.0,
475
+ "max": 0.0
476
+ },
477
+ {
478
+ "current": 2650.0,
479
+ "min": 0.0,
480
+ "max": 0.0
481
+ },
482
+ {
483
+ "current": 2650.0,
484
+ "min": 0.0,
485
+ "max": 0.0
486
+ },
487
+ {
488
+ "current": 2650.0,
489
+ "min": 0.0,
490
+ "max": 0.0
491
+ },
492
+ {
493
+ "current": 2650.0,
494
+ "min": 0.0,
495
+ "max": 0.0
496
+ },
497
+ {
498
+ "current": 2650.0,
499
+ "min": 0.0,
500
+ "max": 0.0
501
+ },
502
+ {
503
+ "current": 2650.0,
504
+ "min": 0.0,
505
+ "max": 0.0
506
+ },
507
+ {
508
+ "current": 2650.0,
509
+ "min": 0.0,
510
+ "max": 0.0
511
+ }
512
+ ],
513
+ "disk": {
514
+ "/": {
515
+ "total": 290.7472343444824,
516
+ "used": 57.44965744018555
517
+ }
518
+ },
519
+ "gpu": "NVIDIA H100 80GB HBM3",
520
+ "gpu_count": 8,
521
+ "gpu_devices": [
522
+ {
523
+ "name": "NVIDIA H100 80GB HBM3",
524
+ "memory_total": 85520809984
525
+ },
526
+ {
527
+ "name": "NVIDIA H100 80GB HBM3",
528
+ "memory_total": 85520809984
529
+ },
530
+ {
531
+ "name": "NVIDIA H100 80GB HBM3",
532
+ "memory_total": 85520809984
533
+ },
534
+ {
535
+ "name": "NVIDIA H100 80GB HBM3",
536
+ "memory_total": 85520809984
537
+ },
538
+ {
539
+ "name": "NVIDIA H100 80GB HBM3",
540
+ "memory_total": 85520809984
541
+ },
542
+ {
543
+ "name": "NVIDIA H100 80GB HBM3",
544
+ "memory_total": 85520809984
545
+ },
546
+ {
547
+ "name": "NVIDIA H100 80GB HBM3",
548
+ "memory_total": 85520809984
549
+ },
550
+ {
551
+ "name": "NVIDIA H100 80GB HBM3",
552
+ "memory_total": 85520809984
553
+ }
554
+ ],
555
+ "memory": {
556
+ "total": 1999.9855346679688
557
+ }
558
+ }
wandb/run-20240201_175850-i93q0p12/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/loss": 4.6222, "train/learning_rate": 0.00028865524291388006, "train/epoch": 1.06, "train/global_step": 290, "_timestamp": 1706811429.4937887, "_runtime": 1099.4098217487335, "_step": 30, "eval/loss": 4.737916946411133, "eval/runtime": 30.7851, "eval/samples_per_second": 501.248, "eval/steps_per_second": 1.981}
wandb/run-20240201_175850-i93q0p12/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240201_175850-i93q0p12/logs/debug.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-02-01 17:58:50,075 INFO MainThread:239318 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
2
+ 2024-02-01 17:58:50,075 INFO MainThread:239318 [wandb_setup.py:_flush():76] Configure stats pid to 239318
3
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
4
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/settings
5
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft/run_sft.py'}
8
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_175850-i93q0p12/logs/debug.log
9
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft/wandb/run-20240201_175850-i93q0p12/logs/debug-internal.log
10
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:init():564] calling init triggers
11
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
12
+ config: {}
13
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:init():614] starting backend
14
+ 2024-02-01 17:58:50,076 INFO MainThread:239318 [wandb_init.py:init():618] setting up manager
15
+ 2024-02-01 17:58:50,079 INFO MainThread:239318 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-02-01 17:58:50,083 INFO MainThread:239318 [wandb_init.py:init():624] backend started and connected
17
+ 2024-02-01 17:58:50,087 INFO MainThread:239318 [wandb_init.py:init():716] updated telemetry
18
+ 2024-02-01 17:58:50,107 INFO MainThread:239318 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
19
+ 2024-02-01 17:58:50,409 INFO MainThread:239318 [wandb_run.py:_on_init():2254] communicating current version
20
+ 2024-02-01 17:58:50,456 INFO MainThread:239318 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.2 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-02-01 17:58:50,456 INFO MainThread:239318 [wandb_init.py:init():800] starting run threads in backend
23
+ 2024-02-01 17:58:55,347 INFO MainThread:239318 [wandb_run.py:_console_start():2233] atexit reg
24
+ 2024-02-01 17:58:55,347 INFO MainThread:239318 [wandb_run.py:_redirect():2088] redirect: wrap_raw
25
+ 2024-02-01 17:58:55,348 INFO MainThread:239318 [wandb_run.py:_redirect():2153] Wrapping output streams.
26
+ 2024-02-01 17:58:55,348 INFO MainThread:239318 [wandb_run.py:_redirect():2178] Redirects installed.
27
+ 2024-02-01 17:58:55,348 INFO MainThread:239318 [wandb_init.py:init():841] run started, returning control to user process
28
+ 2024-02-01 17:58:55,349 INFO MainThread:239318 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.36.2', 'model_type': 'mistral', 'output_dir': './', 'overwrite_output_dir': 
True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 64, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Feb01_17-58-13_ip-26-0-165-24', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 10, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': 
False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': False, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'max_seq_length': 2048}
wandb/run-20240201_175850-i93q0p12/run-i93q0p12.wandb ADDED
Binary file (295 kB). View file