BrownianNotion committed on
Commit 1533579 · verified · 1 Parent(s): aa5c995

Add files using upload-large-folder tool

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20250408_144457-lfekgvx4/run-lfekgvx4.wandb filter=lfs diff=lfs merge=lfs -text
metrics.json ADDED
@@ -0,0 +1,32 @@
+ {
+     "PPL": 17.111480712890625,
+     "arc_challenge": {
+         "acc": 0.2158703071672355,
+         "acc_stderr": 0.012022975360030667,
+         "acc_norm": 0.26621160409556316,
+         "acc_norm_stderr": 0.01291577478152322
+     },
+     "arc_easy": {
+         "acc": 0.37836700336700335,
+         "acc_stderr": 0.00995157568333195,
+         "acc_norm": 0.36195286195286197,
+         "acc_norm_stderr": 0.009860991466688493
+     },
+     "hellaswag": {
+         "acc": 0.33389762995419237,
+         "acc_stderr": 0.004706398252382465,
+         "acc_norm": 0.4028082055367457,
+         "acc_norm_stderr": 0.004894604293405655
+     },
+     "piqa": {
+         "acc": 0.6245919477693145,
+         "acc_stderr": 0.01129783958977666,
+         "acc_norm": 0.6180631120783461,
+         "acc_norm_stderr": 0.011335942557505228
+     },
+     "winogrande": {
+         "acc": 0.5240726124704025,
+         "acc_stderr": 0.014036189665395132
+     },
+     "QA Avg": 0.4153599001456296
+ }
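
Note: the "QA Avg" field matches the unweighted mean of the five task accuracies ("acc", not "acc_norm"). A minimal sketch of that check, assuming metrics.json is read from the repository root:

import json

with open("metrics.json") as f:
    metrics = json.load(f)

tasks = ["arc_challenge", "arc_easy", "hellaswag", "piqa", "winogrande"]
qa_avg = sum(metrics[t]["acc"] for t in tasks) / len(tasks)
print(qa_avg)                                   # ~0.41536, matching "QA Avg"
print(abs(qa_avg - metrics["QA Avg"]) < 1e-12)  # True (up to float rounding)
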
runs/events.out.tfevents.1744123376.8f7554e9d37a ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77189375954190cbe2609b01f1d22bf4e786ee93e28944f1018d502ce5ee381a
3
+ size 5735
runs/events.out.tfevents.1744123518.8f7554e9d37a ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35366aa6275a06f709994d6198fdaf9a4ebde7d9f29355d03ba91dff398e8a0f
3
+ size 70125
wandb/debug-internal.log ADDED
@@ -0,0 +1,16 @@
1
+ {"time":"2025-04-08T14:44:57.021153491Z","level":"INFO","msg":"stream: starting","core version":"0.19.9","symlink path":"ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_144457-lfekgvx4/logs/debug-core.log"}
2
+ {"time":"2025-04-08T14:44:57.323241702Z","level":"INFO","msg":"created new stream","id":"lfekgvx4"}
3
+ {"time":"2025-04-08T14:44:57.323338343Z","level":"INFO","msg":"stream: started","id":"lfekgvx4"}
4
+ {"time":"2025-04-08T14:44:57.323398396Z","level":"INFO","msg":"writer: Do: started","stream_id":"lfekgvx4"}
5
+ {"time":"2025-04-08T14:44:57.323467054Z","level":"INFO","msg":"handler: started","stream_id":"lfekgvx4"}
6
+ {"time":"2025-04-08T14:44:57.323506598Z","level":"INFO","msg":"sender: started","stream_id":"lfekgvx4"}
7
+ {"time":"2025-04-08T14:44:57.614102027Z","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-04-08T16:04:18.122439099Z","level":"INFO","msg":"Stopping system monitor"}
9
+ {"time":"2025-04-08T16:04:18.122764017Z","level":"INFO","msg":"Stopped system monitor"}
10
+ {"time":"2025-04-08T16:04:18.897381609Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
11
+ {"time":"2025-04-08T16:04:19.123406575Z","level":"INFO","msg":"handler: operation stats","stats":{}}
12
+ {"time":"2025-04-08T16:04:19.131020207Z","level":"INFO","msg":"stream: closing","id":"lfekgvx4"}
13
+ {"time":"2025-04-08T16:04:19.131051826Z","level":"INFO","msg":"handler: closed","stream_id":"lfekgvx4"}
14
+ {"time":"2025-04-08T16:04:19.131065982Z","level":"INFO","msg":"writer: Close: closed","stream_id":"lfekgvx4"}
15
+ {"time":"2025-04-08T16:04:19.131087513Z","level":"INFO","msg":"sender: closed","stream_id":"lfekgvx4"}
16
+ {"time":"2025-04-08T16:04:19.131275715Z","level":"INFO","msg":"stream: closed","id":"lfekgvx4"}
wandb/debug.log ADDED
@@ -0,0 +1,30 @@
1
+ 2025-04-08 14:44:57,011 INFO MainThread:8387 [wandb_setup.py:_flush():67] Current SDK version is 0.19.9
2
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_setup.py:_flush():67] Configure stats pid to 8387
3
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_setup.py:_flush():67] Loading settings from /root/.config/wandb/settings
4
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_setup.py:_flush():67] Loading settings from /workspace/BitDistiller/train/wandb/settings
5
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:setup_run_log_directory():662] Logging user logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_144457-lfekgvx4/logs/debug.log
7
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:setup_run_log_directory():663] Logging internal logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_144457-lfekgvx4/logs/debug-internal.log
8
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:init():781] calling init triggers
9
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:init():786] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:init():809] starting backend
12
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:init():813] sending inform_init request
13
+ 2025-04-08 14:44:57,017 INFO MainThread:8387 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-08 14:44:57,017 INFO MainThread:8387 [wandb_init.py:init():823] backend started and connected
15
+ 2025-04-08 14:44:57,020 INFO MainThread:8387 [wandb_init.py:init():915] updated telemetry
16
+ 2025-04-08 14:44:57,211 INFO MainThread:8387 [wandb_init.py:init():939] communicating run to backend with 90.0 second timeout
17
+ 2025-04-08 14:44:57,610 INFO MainThread:8387 [wandb_init.py:init():1014] starting run threads in backend
18
+ 2025-04-08 14:44:57,712 INFO MainThread:8387 [wandb_run.py:_console_start():2454] atexit reg
19
+ 2025-04-08 14:44:57,712 INFO MainThread:8387 [wandb_run.py:_redirect():2306] redirect: wrap_raw
20
+ 2025-04-08 14:44:57,712 INFO MainThread:8387 [wandb_run.py:_redirect():2371] Wrapping output streams.
21
+ 2025-04-08 14:44:57,712 INFO MainThread:8387 [wandb_run.py:_redirect():2394] Redirects installed.
22
+ 2025-04-08 14:44:57,714 INFO MainThread:8387 [wandb_init.py:init():1056] run started, returning control to user process
23
+ 2025-04-08 14:45:18,149 INFO MainThread:8387 [wandb_run.py:_config_callback():1327] config_cb None None {'vocab_size': 32001, 'max_position_embeddings': 2048, 'hidden_size': 2048, 'intermediate_size': 5632, 'num_hidden_layers': 22, 'num_attention_heads': 32, 'num_key_value_heads': 4, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '../models/TinyLlama_v1.1/', 'transformers_version': '4.37.0', 'model_type': 'llama', 'output_dir': './ckpts/tinyllama_v1.1/int2-g128/', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 4.0, 'max_steps': -1, 'lr_scheduler_type': 'constant', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './ckpts/tinyllama_v1.1/int2-g128/runs/', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 40, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 40, 'dataloader_num_workers': 0, 'past_index': -1, 
'run_name': './ckpts/tinyllama_v1.1/int2-g128/', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'deepspeed': 'config/zero.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': False, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'cache_dir': None, 'model_max_length': 1024, 'bits': 2, 'q_group_size': 128, 'quant_type': 'int2-asym', 'clip': '../quantization/clip_cache/TinyLlama_v1.1/int2-g128.pt', 'train_kd': True, 'kd_tmp': 1, 'kd_loss_type': 'cakld', 'cakld_steps': 10}
24
+ 2025-04-08 16:04:18,120 INFO MainThread:8387 [wandb_run.py:_finish():2189] finishing run DeepFriedNLP/SNLP_BitDistiller/lfekgvx4
25
+ 2025-04-08 16:04:18,121 INFO MainThread:8387 [wandb_run.py:_atexit_cleanup():2419] got exitcode: 0
26
+ 2025-04-08 16:04:18,121 INFO MainThread:8387 [wandb_run.py:_restore():2401] restore
27
+ 2025-04-08 16:04:18,121 INFO MainThread:8387 [wandb_run.py:_restore():2407] restore done
28
+ 2025-04-08 16:04:19,128 INFO MainThread:8387 [wandb_run.py:_footer_history_summary_info():4064] rendering history
29
+ 2025-04-08 16:04:19,129 INFO MainThread:8387 [wandb_run.py:_footer_history_summary_info():4096] rendering summary
30
+ 2025-04-08 16:04:19,130 INFO MainThread:8387 [wandb_run.py:_footer_sync_info():4025] logging synced files
wandb/run-20250408_144213-ss3av20k/files/output.log ADDED
@@ -0,0 +1,23 @@
1
+ /workspace/BitDistiller/BitDistillerVenv/lib/python3.9/site-packages/accelerate/accelerator.py:432: FutureWarning: Passing the following arguments to `Accelerator` is deprecated and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches']). Please pass an `accelerate.DataLoaderConfiguration` instead:
2
+ dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
3
+ warnings.warn(
4
+ Using /root/.cache/torch_extensions/py39_cu124 as PyTorch extensions root...
5
+ Creating extension directory /root/.cache/torch_extensions/py39_cu124/cpu_adam...
6
+ Emitting ninja build file /root/.cache/torch_extensions/py39_cu124/cpu_adam/build.ninja...
7
+ Building extension module cpu_adam...
8
+ Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
9
+ Loading extension module cpu_adam...
10
+ Time to load cpu_adam op: 25.886926889419556 seconds
11
+ [2025-04-08 14:42:56,582] [WARNING] [lr_schedules.py:683:get_lr] Attempting to get learning rate from scheduler before it has started
12
+ 0%| | 0/400 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
13
+ /workspace/BitDistiller/BitDistillerVenv/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py:745: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
14
+ return fn(*args, **kwargs)
15
+ 2%|█▎ | 8/400 [01:18<1:01:49, 9.46s/it]
16
+ {'loss': 4117.6992, 'learning_rate': 0.0, 'epoch': 0.01}
17
+ {'loss': 3744.9165, 'learning_rate': 2e-05, 'epoch': 0.02}
18
+ {'loss': 2814.1448, 'learning_rate': 2e-05, 'epoch': 0.03}
19
+ {'loss': 1135.4133, 'learning_rate': 2e-05, 'epoch': 0.04}
20
+ {'loss': 1219.8018, 'learning_rate': 2e-05, 'epoch': 0.05}
21
+ {'loss': 975.1074, 'learning_rate': 2e-05, 'epoch': 0.06}
22
+ {'loss': 575.0048, 'learning_rate': 2e-05, 'epoch': 0.07}
23
+ {'loss': 499.4206, 'learning_rate': 2e-05, 'epoch': 0.08}
wandb/run-20250408_144213-ss3av20k/files/requirements.txt ADDED
@@ -0,0 +1,142 @@
1
+ setuptools==58.1.0
2
+ pip==23.0.1
3
+ wcwidth==0.2.13
4
+ triton==3.2.0
5
+ sqlitedict==2.1.0
6
+ sentencepiece==0.2.0
7
+ pytz==2025.2
8
+ py-cpuinfo==9.0.0
9
+ pure_eval==0.2.3
10
+ ptyprocess==0.7.0
11
+ nvidia-cusparselt-cu12==0.6.2
12
+ mpmath==1.3.0
13
+ hjson==3.1.0
14
+ Fraction==2.2.0
15
+ antlr4-python3-runtime==4.9.3
16
+ zstandard==0.23.0
17
+ zipp==3.21.0
18
+ xxhash==3.5.0
19
+ urllib3==2.3.0
20
+ tzdata==2025.2
21
+ typing_extensions==4.13.1
22
+ traitlets==5.14.3
23
+ tqdm==4.67.1
24
+ tornado==6.4.2
25
+ threadpoolctl==3.6.0
26
+ tcolorpy==0.1.7
27
+ tabulate==0.9.0
28
+ sympy==1.13.1
29
+ smmap==5.0.2
30
+ six==1.17.0
31
+ setproctitle==1.3.5
32
+ safetensors==0.5.3
33
+ regex==2024.11.6
34
+ pyzmq==26.4.0
35
+ PyYAML==6.0.2
36
+ Pygments==2.19.1
37
+ pycountry==24.6.1
38
+ pyarrow==19.0.1
39
+ psutil==7.0.0
40
+ protobuf==5.29.4
41
+ propcache==0.3.1
42
+ prompt_toolkit==3.0.50
43
+ portalocker==3.1.1
44
+ platformdirs==4.3.7
45
+ pexpect==4.9.0
46
+ pathvalidate==3.2.3
47
+ parso==0.8.4
48
+ packaging==24.2
49
+ nvidia-nvtx-cu12==12.4.127
50
+ nvidia-nvjitlink-cu12==12.4.127
51
+ nvidia-nccl-cu12==2.21.5
52
+ nvidia-curand-cu12==10.3.5.147
53
+ nvidia-cufft-cu12==11.2.1.3
54
+ nvidia-cuda-runtime-cu12==12.4.127
55
+ nvidia-cuda-nvrtc-cu12==12.4.127
56
+ nvidia-cuda-cupti-cu12==12.4.127
57
+ nvidia-cublas-cu12==12.4.5.8
58
+ numpy==2.0.2
59
+ ninja==1.11.1.4
60
+ networkx==3.2.1
61
+ nest-asyncio==1.6.0
62
+ msgpack==1.1.0
63
+ MarkupSafe==3.0.2
64
+ lxml==5.3.2
65
+ joblib==1.4.2
66
+ idna==3.10
67
+ fsspec==2024.12.0
68
+ frozenlist==1.5.0
69
+ filelock==3.18.0
70
+ executing==2.2.0
71
+ exceptiongroup==1.2.2
72
+ eval_type_backport==0.2.2
73
+ einops==0.8.1
74
+ dill==0.3.8
75
+ decorator==5.2.1
76
+ debugpy==1.8.13
77
+ colorama==0.4.6
78
+ click==8.1.8
79
+ charset-normalizer==3.4.1
80
+ chardet==5.2.0
81
+ certifi==2025.1.31
82
+ attrs==25.3.0
83
+ async-timeout==5.0.1
84
+ asttokens==3.0.0
85
+ annotated-types==0.7.0
86
+ aiohappyeyeballs==2.6.1
87
+ absl-py==2.2.2
88
+ typing-inspection==0.4.0
89
+ tqdm-multiprocess==0.0.11
90
+ tensorboardX==2.6.2.2
91
+ stack-data==0.6.3
92
+ sentry-sdk==2.25.1
93
+ scipy==1.13.1
94
+ sacrebleu==2.5.1
95
+ requests==2.32.3
96
+ python-dateutil==2.9.0.post0
97
+ pydantic_core==2.33.1
98
+ omegaconf==2.3.0
99
+ nvidia-cusparse-cu12==12.3.1.170
100
+ nvidia-cudnn-cu12==9.1.0.70
101
+ numexpr==2.10.2
102
+ nltk==3.9.1
103
+ multiprocess==0.70.16
104
+ multidict==6.3.2
105
+ mbstrdecoder==1.1.4
106
+ matplotlib-inline==0.1.7
107
+ jupyter_core==5.7.2
108
+ jsonlines==4.0.0
109
+ Jinja2==3.1.6
110
+ jedi==0.19.2
111
+ importlib_resources==6.5.2
112
+ importlib_metadata==8.6.1
113
+ gitdb==4.0.12
114
+ docker-pycreds==0.4.0
115
+ comm==0.2.2
116
+ aiosignal==1.3.2
117
+ yarl==1.19.0
118
+ typepy==1.3.4
119
+ scikit-learn==1.6.1
120
+ rouge-score==0.1.2
121
+ pydantic==2.11.3
122
+ pandas==2.2.3
123
+ nvidia-cusolver-cu12==11.6.1.9
124
+ jupyter_client==8.6.3
125
+ ipython==8.18.1
126
+ huggingface-hub==0.30.2
127
+ GitPython==3.1.44
128
+ wandb==0.19.9
129
+ torch==2.6.0
130
+ tokenizers==0.15.2
131
+ ipykernel==6.29.5
132
+ aiohttp==3.11.16
133
+ transformers==4.37.0
134
+ deepspeed==0.16.5
135
+ DataProperty==1.1.0
136
+ bitsandbytes==0.45.5
137
+ accelerate==0.28.0
138
+ tabledata==1.3.4
139
+ peft==0.8.0
140
+ datasets==3.5.0
141
+ pytablewriter==1.2.1
142
+ evaluate==0.4.3
wandb/run-20250408_144213-ss3av20k/files/wandb-metadata.json ADDED
@@ -0,0 +1,108 @@
1
+ {
2
+ "os": "Linux-5.15.0-134-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.9.21",
4
+ "startedAt": "2025-04-08T14:42:13.115591Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "../models/TinyLlama_v1.1/",
9
+ "--data_path",
10
+ "../data/datasets/Llama-3-3B/mix_wiki_alpaca_8000.json",
11
+ "--model_max_length",
12
+ "1024",
13
+ "--output_dir",
14
+ "./ckpts/tinyllama_v1.1/int2-g128/",
15
+ "--logging_dir",
16
+ "./ckpts/tinyllama_v1.1/int2-g128/runs/",
17
+ "--num_train_epochs",
18
+ "4",
19
+ "--bf16",
20
+ "True",
21
+ "--seed",
22
+ "42",
23
+ "--per_device_train_batch_size",
24
+ "16",
25
+ "--per_device_eval_batch_size",
26
+ "16",
27
+ "--gradient_accumulation_steps",
28
+ "4",
29
+ "--gradient_checkpointing",
30
+ "True",
31
+ "--evaluation_strategy",
32
+ "steps",
33
+ "--eval_steps",
34
+ "40",
35
+ "--load_best_model_at_end",
36
+ "True",
37
+ "--save_strategy",
38
+ "steps",
39
+ "--save_steps",
40
+ "40",
41
+ "--save_total_limit",
42
+ "2",
43
+ "--learning_rate",
44
+ "2e-5",
45
+ "--lr_scheduler_type",
46
+ "constant",
47
+ "--weight_decay",
48
+ "0.",
49
+ "--logging_steps",
50
+ "1",
51
+ "--report_to",
52
+ "tensorboard",
53
+ "wandb",
54
+ "--deepspeed",
55
+ "config/zero.json",
56
+ "--bits",
57
+ "2",
58
+ "--quant_type",
59
+ "int2-asym",
60
+ "--q_group_size",
61
+ "128",
62
+ "--train_kd",
63
+ "True",
64
+ "--kd_loss_type",
65
+ "cakld",
66
+ "--max_train_samples",
67
+ "999999",
68
+ "--clip",
69
+ "../quantization/clip_cache/TinyLlama_v1.1/int2-g128.pt"
70
+ ],
71
+ "program": "/workspace/BitDistiller/train/train.py",
72
+ "codePath": "train/train.py",
73
+ "git": {
74
+ "remote": "git@github.com:BrownianNotion/BitDistiller.git",
75
+ "commit": "7b9cf8ae46eb4dd7fcdccefe54de4dd992197672"
76
+ },
77
+ "email": "andrewwusyd@gmail.com",
78
+ "root": "./ckpts/tinyllama_v1.1/int2-g128/",
79
+ "host": "8f7554e9d37a",
80
+ "executable": "/workspace/BitDistiller/BitDistillerVenv/bin/python3.9",
81
+ "codePathLocal": "train.py",
82
+ "cpu_count": 64,
83
+ "cpu_count_logical": 128,
84
+ "gpu": "NVIDIA A100-SXM4-80GB",
85
+ "gpu_count": 1,
86
+ "disk": {
87
+ "/": {
88
+ "total": "326417514496",
89
+ "used": "15038136320"
90
+ }
91
+ },
92
+ "memory": {
93
+ "total": "540598263808"
94
+ },
95
+ "cpu": {
96
+ "count": 64,
97
+ "countLogical": 128
98
+ },
99
+ "gpu_nvidia": [
100
+ {
101
+ "name": "NVIDIA A100-SXM4-80GB",
102
+ "memoryTotal": "85899345920",
103
+ "cudaCores": 6912,
104
+ "architecture": "Ampere"
105
+ }
106
+ ],
107
+ "cudaVersion": "12.8"
108
+ }
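
Aside: wandb-metadata.json records the program path and the exact argument list passed to train/train.py, so the launch settings for this run can be reconstructed from it. A minimal sketch (illustrative only; the `--local_rank=0` entry suggests the process was spawned by a launcher such as DeepSpeed rather than invoked directly):

import json, shlex

meta_path = "wandb/run-20250408_144213-ss3av20k/files/wandb-metadata.json"
with open(meta_path) as f:
    meta = json.load(f)

# "program" and "args" are the fields shown in the file above.
cmd = ["python", meta["program"], *meta["args"]]
print(" ".join(shlex.quote(part) for part in cmd))
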
wandb/run-20250408_144213-ss3av20k/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
1
+ {"time":"2025-04-08T14:42:12.580600271Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp71xnkkod/port-6389.txt","pid":6389,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-04-08T14:42:12.583792876Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":6389}
3
+ {"time":"2025-04-08T14:42:12.583761638Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":34155,"Zone":""}}
4
+ {"time":"2025-04-08T14:42:12.76881416Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:51364"}
5
+ {"time":"2025-04-08T14:42:13.118068579Z","level":"INFO","msg":"handleInformInit: received","streamId":"ss3av20k","id":"127.0.0.1:51364"}
6
+ {"time":"2025-04-08T14:42:13.415612007Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ss3av20k","id":"127.0.0.1:51364"}
7
+ {"time":"2025-04-08T14:44:17.67801083Z","level":"INFO","msg":"received shutdown signal","signal":15}
wandb/run-20250408_144213-ss3av20k/logs/debug-internal.log ADDED
@@ -0,0 +1,7 @@
1
+ {"time":"2025-04-08T14:42:13.118389529Z","level":"INFO","msg":"stream: starting","core version":"0.19.9","symlink path":"ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_144213-ss3av20k/logs/debug-core.log"}
2
+ {"time":"2025-04-08T14:42:13.415533129Z","level":"INFO","msg":"created new stream","id":"ss3av20k"}
3
+ {"time":"2025-04-08T14:42:13.415599754Z","level":"INFO","msg":"stream: started","id":"ss3av20k"}
4
+ {"time":"2025-04-08T14:42:13.41562969Z","level":"INFO","msg":"writer: Do: started","stream_id":"ss3av20k"}
5
+ {"time":"2025-04-08T14:42:13.415679323Z","level":"INFO","msg":"sender: started","stream_id":"ss3av20k"}
6
+ {"time":"2025-04-08T14:42:13.415737612Z","level":"INFO","msg":"handler: started","stream_id":"ss3av20k"}
7
+ {"time":"2025-04-08T14:42:13.645572656Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20250408_144213-ss3av20k/logs/debug.log ADDED
@@ -0,0 +1,23 @@
1
+ 2025-04-08 14:42:13,109 INFO MainThread:6389 [wandb_setup.py:_flush():67] Current SDK version is 0.19.9
2
+ 2025-04-08 14:42:13,109 INFO MainThread:6389 [wandb_setup.py:_flush():67] Configure stats pid to 6389
3
+ 2025-04-08 14:42:13,109 INFO MainThread:6389 [wandb_setup.py:_flush():67] Loading settings from /root/.config/wandb/settings
4
+ 2025-04-08 14:42:13,109 INFO MainThread:6389 [wandb_setup.py:_flush():67] Loading settings from /workspace/BitDistiller/train/wandb/settings
5
+ 2025-04-08 14:42:13,109 INFO MainThread:6389 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-04-08 14:42:13,110 INFO MainThread:6389 [wandb_init.py:setup_run_log_directory():662] Logging user logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_144213-ss3av20k/logs/debug.log
7
+ 2025-04-08 14:42:13,110 INFO MainThread:6389 [wandb_init.py:setup_run_log_directory():663] Logging internal logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_144213-ss3av20k/logs/debug-internal.log
8
+ 2025-04-08 14:42:13,110 INFO MainThread:6389 [wandb_init.py:init():781] calling init triggers
9
+ 2025-04-08 14:42:13,110 INFO MainThread:6389 [wandb_init.py:init():786] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-04-08 14:42:13,110 INFO MainThread:6389 [wandb_init.py:init():809] starting backend
12
+ 2025-04-08 14:42:13,110 INFO MainThread:6389 [wandb_init.py:init():813] sending inform_init request
13
+ 2025-04-08 14:42:13,114 INFO MainThread:6389 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-08 14:42:13,115 INFO MainThread:6389 [wandb_init.py:init():823] backend started and connected
15
+ 2025-04-08 14:42:13,118 INFO MainThread:6389 [wandb_init.py:init():915] updated telemetry
16
+ 2025-04-08 14:42:13,309 INFO MainThread:6389 [wandb_init.py:init():939] communicating run to backend with 90.0 second timeout
17
+ 2025-04-08 14:42:13,642 INFO MainThread:6389 [wandb_init.py:init():1014] starting run threads in backend
18
+ 2025-04-08 14:42:13,750 INFO MainThread:6389 [wandb_run.py:_console_start():2454] atexit reg
19
+ 2025-04-08 14:42:13,750 INFO MainThread:6389 [wandb_run.py:_redirect():2306] redirect: wrap_raw
20
+ 2025-04-08 14:42:13,750 INFO MainThread:6389 [wandb_run.py:_redirect():2371] Wrapping output streams.
21
+ 2025-04-08 14:42:13,750 INFO MainThread:6389 [wandb_run.py:_redirect():2394] Redirects installed.
22
+ 2025-04-08 14:42:13,752 INFO MainThread:6389 [wandb_init.py:init():1056] run started, returning control to user process
23
+ 2025-04-08 14:42:56,589 INFO MainThread:6389 [wandb_run.py:_config_callback():1327] config_cb None None {'vocab_size': 32001, 'max_position_embeddings': 2048, 'hidden_size': 2048, 'intermediate_size': 5632, 'num_hidden_layers': 22, 'num_attention_heads': 32, 'num_key_value_heads': 4, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '../models/TinyLlama_v1.1/', 'transformers_version': '4.37.0', 'model_type': 'llama', 'output_dir': './ckpts/tinyllama_v1.1/int2-g128/', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 4.0, 'max_steps': -1, 'lr_scheduler_type': 'constant', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './ckpts/tinyllama_v1.1/int2-g128/runs/', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 40, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 40, 'dataloader_num_workers': 0, 'past_index': -1, 
'run_name': './ckpts/tinyllama_v1.1/int2-g128/', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'deepspeed': 'config/zero.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': False, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'cache_dir': None, 'model_max_length': 1024, 'bits': 2, 'q_group_size': 128, 'quant_type': 'int2-asym', 'clip': '../quantization/clip_cache/TinyLlama_v1.1/int2-g128.pt', 'train_kd': True, 'kd_tmp': 1, 'kd_loss_type': 'cakld', 'cakld_steps': 10}
wandb/run-20250408_144213-ss3av20k/run-ss3av20k.wandb ADDED
File without changes
wandb/run-20250408_144457-lfekgvx4/files/config.yaml ADDED
@@ -0,0 +1,525 @@
1
+ _name_or_path:
2
+ value: ../models/TinyLlama_v1.1/
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.19.9
6
+ m:
7
+ - "1": train/train_runtime
8
+ "5": 2
9
+ "6":
10
+ - 1
11
+ - 3
12
+ "7": []
13
+ - "1": train/global_step
14
+ "6":
15
+ - 3
16
+ "7": []
17
+ - "1": train/epoch
18
+ "5": 2
19
+ "6":
20
+ - 1
21
+ - 3
22
+ "7": []
23
+ - "1": train/total_flos
24
+ "5": 2
25
+ "6":
26
+ - 1
27
+ - 3
28
+ "7": []
29
+ - "1": eval/loss
30
+ "5": 2
31
+ "6":
32
+ - 1
33
+ - 3
34
+ "7": []
35
+ - "1": eval/runtime
36
+ "5": 2
37
+ "6":
38
+ - 1
39
+ - 3
40
+ "7": []
41
+ - "1": eval/steps_per_second
42
+ "5": 2
43
+ "6":
44
+ - 1
45
+ - 3
46
+ "7": []
47
+ - "1": train/train_samples_per_second
48
+ "5": 2
49
+ "6":
50
+ - 1
51
+ - 3
52
+ "7": []
53
+ - "1": train/train_steps_per_second
54
+ "5": 2
55
+ "6":
56
+ - 1
57
+ - 3
58
+ "7": []
59
+ - "1": train/loss
60
+ "5": 2
61
+ "6":
62
+ - 1
63
+ - 3
64
+ "7": []
65
+ - "1": train/learning_rate
66
+ "5": 2
67
+ "6":
68
+ - 1
69
+ - 3
70
+ "7": []
71
+ - "1": eval/samples_per_second
72
+ "5": 2
73
+ "6":
74
+ - 1
75
+ - 3
76
+ "7": []
77
+ - "1": train/train_loss
78
+ "5": 2
79
+ "6":
80
+ - 1
81
+ - 3
82
+ "7": []
83
+ python_version: 3.9.21
84
+ t:
85
+ "1":
86
+ - 1
87
+ - 5
88
+ - 11
89
+ - 49
90
+ - 51
91
+ - 53
92
+ - 55
93
+ - 71
94
+ - 98
95
+ "2":
96
+ - 1
97
+ - 5
98
+ - 11
99
+ - 49
100
+ - 51
101
+ - 53
102
+ - 55
103
+ - 71
104
+ - 98
105
+ "3":
106
+ - 2
107
+ - 7
108
+ - 13
109
+ - 15
110
+ - 23
111
+ - 55
112
+ - 66
113
+ "4": 3.9.21
114
+ "5": 0.19.9
115
+ "6": 4.37.0
116
+ "8":
117
+ - 5
118
+ "9":
119
+ "1": transformers_trainer
120
+ "12": 0.19.9
121
+ "13": linux-x86_64
122
+ adafactor:
123
+ value: false
124
+ adam_beta1:
125
+ value: 0.9
126
+ adam_beta2:
127
+ value: 0.999
128
+ adam_epsilon:
129
+ value: 1e-08
130
+ add_cross_attention:
131
+ value: false
132
+ architectures:
133
+ value:
134
+ - LlamaForCausalLM
135
+ attention_bias:
136
+ value: false
137
+ attention_dropout:
138
+ value: 0
139
+ auto_find_batch_size:
140
+ value: false
141
+ bad_words_ids:
142
+ value: null
143
+ begin_suppress_tokens:
144
+ value: null
145
+ bf16:
146
+ value: true
147
+ bf16_full_eval:
148
+ value: false
149
+ bits:
150
+ value: 2
151
+ bos_token_id:
152
+ value: 1
153
+ cache_dir:
154
+ value: null
155
+ cakld_steps:
156
+ value: 10
157
+ chunk_size_feed_forward:
158
+ value: 0
159
+ clip:
160
+ value: ../quantization/clip_cache/TinyLlama_v1.1/int2-g128.pt
161
+ cross_attention_hidden_size:
162
+ value: null
163
+ data_seed:
164
+ value: null
165
+ dataloader_drop_last:
166
+ value: false
167
+ dataloader_num_workers:
168
+ value: 0
169
+ dataloader_persistent_workers:
170
+ value: false
171
+ dataloader_pin_memory:
172
+ value: true
173
+ ddp_backend:
174
+ value: null
175
+ ddp_broadcast_buffers:
176
+ value: null
177
+ ddp_bucket_cap_mb:
178
+ value: null
179
+ ddp_find_unused_parameters:
180
+ value: null
181
+ ddp_timeout:
182
+ value: 1800
183
+ debug:
184
+ value: []
185
+ decoder_start_token_id:
186
+ value: null
187
+ deepspeed:
188
+ value: config/zero.json
189
+ disable_tqdm:
190
+ value: false
191
+ dispatch_batches:
192
+ value: null
193
+ diversity_penalty:
194
+ value: 0
195
+ do_eval:
196
+ value: true
197
+ do_predict:
198
+ value: false
199
+ do_sample:
200
+ value: false
201
+ do_train:
202
+ value: false
203
+ early_stopping:
204
+ value: false
205
+ encoder_no_repeat_ngram_size:
206
+ value: 0
207
+ eos_token_id:
208
+ value: 2
209
+ eval_accumulation_steps:
210
+ value: null
211
+ eval_delay:
212
+ value: 0
213
+ eval_steps:
214
+ value: 40
215
+ evaluation_strategy:
216
+ value: steps
217
+ exponential_decay_length_penalty:
218
+ value: null
219
+ finetuning_task:
220
+ value: null
221
+ forced_bos_token_id:
222
+ value: null
223
+ forced_eos_token_id:
224
+ value: null
225
+ fp16:
226
+ value: false
227
+ fp16_backend:
228
+ value: auto
229
+ fp16_full_eval:
230
+ value: false
231
+ fp16_opt_level:
232
+ value: O1
233
+ fsdp:
234
+ value: []
235
+ fsdp_config:
236
+ value:
237
+ min_num_params: 0
238
+ xla: false
239
+ xla_fsdp_grad_ckpt: false
240
+ fsdp_min_num_params:
241
+ value: 0
242
+ fsdp_transformer_layer_cls_to_wrap:
243
+ value: null
244
+ full_determinism:
245
+ value: false
246
+ gradient_accumulation_steps:
247
+ value: 4
248
+ gradient_checkpointing:
249
+ value: true
250
+ gradient_checkpointing_kwargs:
251
+ value: null
252
+ greater_is_better:
253
+ value: false
254
+ group_by_length:
255
+ value: false
256
+ half_precision_backend:
257
+ value: auto
258
+ hidden_act:
259
+ value: silu
260
+ hidden_size:
261
+ value: 2048
262
+ hub_always_push:
263
+ value: false
264
+ hub_model_id:
265
+ value: null
266
+ hub_private_repo:
267
+ value: false
268
+ hub_strategy:
269
+ value: every_save
270
+ hub_token:
271
+ value: <HUB_TOKEN>
272
+ id2label:
273
+ value:
274
+ "0": LABEL_0
275
+ "1": LABEL_1
276
+ ignore_data_skip:
277
+ value: false
278
+ include_inputs_for_metrics:
279
+ value: false
280
+ include_num_input_tokens_seen:
281
+ value: false
282
+ include_tokens_per_second:
283
+ value: false
284
+ initializer_range:
285
+ value: 0.02
286
+ intermediate_size:
287
+ value: 5632
288
+ is_decoder:
289
+ value: false
290
+ is_encoder_decoder:
291
+ value: false
292
+ jit_mode_eval:
293
+ value: false
294
+ kd_loss_type:
295
+ value: cakld
296
+ kd_tmp:
297
+ value: 1
298
+ label_names:
299
+ value: null
300
+ label_smoothing_factor:
301
+ value: 0
302
+ label2id:
303
+ value:
304
+ LABEL_0: 0
305
+ LABEL_1: 1
306
+ learning_rate:
307
+ value: 2e-05
308
+ length_column_name:
309
+ value: length
310
+ length_penalty:
311
+ value: 1
312
+ load_best_model_at_end:
313
+ value: true
314
+ local_rank:
315
+ value: 0
316
+ log_level:
317
+ value: passive
318
+ log_level_replica:
319
+ value: warning
320
+ log_on_each_node:
321
+ value: true
322
+ logging_dir:
323
+ value: ./ckpts/tinyllama_v1.1/int2-g128/runs/
324
+ logging_first_step:
325
+ value: false
326
+ logging_nan_inf_filter:
327
+ value: true
328
+ logging_steps:
329
+ value: 1
330
+ logging_strategy:
331
+ value: steps
332
+ lr_scheduler_type:
333
+ value: constant
334
+ max_grad_norm:
335
+ value: 1
336
+ max_length:
337
+ value: 20
338
+ max_position_embeddings:
339
+ value: 2048
340
+ max_steps:
341
+ value: -1
342
+ metric_for_best_model:
343
+ value: loss
344
+ min_length:
345
+ value: 0
346
+ model_max_length:
347
+ value: 1024
348
+ model_type:
349
+ value: llama
350
+ mp_parameters:
351
+ value: ""
352
+ neftune_noise_alpha:
353
+ value: null
354
+ no_cuda:
355
+ value: false
356
+ no_repeat_ngram_size:
357
+ value: 0
358
+ num_attention_heads:
359
+ value: 32
360
+ num_beam_groups:
361
+ value: 1
362
+ num_beams:
363
+ value: 1
364
+ num_hidden_layers:
365
+ value: 22
366
+ num_key_value_heads:
367
+ value: 4
368
+ num_return_sequences:
369
+ value: 1
370
+ num_train_epochs:
371
+ value: 4
372
+ optim:
373
+ value: adamw_torch
374
+ optim_args:
375
+ value: null
376
+ output_attentions:
377
+ value: false
378
+ output_dir:
379
+ value: ./ckpts/tinyllama_v1.1/int2-g128/
380
+ output_hidden_states:
381
+ value: false
382
+ output_scores:
383
+ value: false
384
+ overwrite_output_dir:
385
+ value: false
386
+ pad_token_id:
387
+ value: null
388
+ past_index:
389
+ value: -1
390
+ per_device_eval_batch_size:
391
+ value: 16
392
+ per_device_train_batch_size:
393
+ value: 16
394
+ per_gpu_eval_batch_size:
395
+ value: null
396
+ per_gpu_train_batch_size:
397
+ value: null
398
+ prediction_loss_only:
399
+ value: false
400
+ prefix:
401
+ value: null
402
+ pretraining_tp:
403
+ value: 1
404
+ problem_type:
405
+ value: null
406
+ push_to_hub:
407
+ value: false
408
+ push_to_hub_model_id:
409
+ value: null
410
+ push_to_hub_organization:
411
+ value: null
412
+ push_to_hub_token:
413
+ value: <PUSH_TO_HUB_TOKEN>
414
+ q_group_size:
415
+ value: 128
416
+ quant_type:
417
+ value: int2-asym
418
+ ray_scope:
419
+ value: last
420
+ remove_invalid_values:
421
+ value: false
422
+ remove_unused_columns:
423
+ value: true
424
+ repetition_penalty:
425
+ value: 1
426
+ report_to:
427
+ value:
428
+ - tensorboard
429
+ - wandb
430
+ resume_from_checkpoint:
431
+ value: null
432
+ return_dict:
433
+ value: true
434
+ return_dict_in_generate:
435
+ value: false
436
+ rms_norm_eps:
437
+ value: 1e-05
438
+ rope_scaling:
439
+ value: null
440
+ rope_theta:
441
+ value: 10000
442
+ run_name:
443
+ value: ./ckpts/tinyllama_v1.1/int2-g128/
444
+ save_on_each_node:
445
+ value: false
446
+ save_only_model:
447
+ value: false
448
+ save_safetensors:
449
+ value: true
450
+ save_steps:
451
+ value: 40
452
+ save_strategy:
453
+ value: steps
454
+ save_total_limit:
455
+ value: 2
456
+ seed:
457
+ value: 42
458
+ sep_token_id:
459
+ value: null
460
+ skip_memory_metrics:
461
+ value: true
462
+ split_batches:
463
+ value: false
464
+ suppress_tokens:
465
+ value: null
466
+ task_specific_params:
467
+ value: null
468
+ temperature:
469
+ value: 1
470
+ tf_legacy_loss:
471
+ value: false
472
+ tf32:
473
+ value: null
474
+ tie_encoder_decoder:
475
+ value: false
476
+ tie_word_embeddings:
477
+ value: false
478
+ tokenizer_class:
479
+ value: null
480
+ top_k:
481
+ value: 50
482
+ top_p:
483
+ value: 1
484
+ torch_compile:
485
+ value: false
486
+ torch_compile_backend:
487
+ value: null
488
+ torch_compile_mode:
489
+ value: null
490
+ torch_dtype:
491
+ value: bfloat16
492
+ torchdynamo:
493
+ value: null
494
+ torchscript:
495
+ value: false
496
+ tpu_metrics_debug:
497
+ value: false
498
+ tpu_num_cores:
499
+ value: null
500
+ train_kd:
501
+ value: true
502
+ transformers_version:
503
+ value: 4.37.0
504
+ typical_p:
505
+ value: 1
506
+ use_bfloat16:
507
+ value: false
508
+ use_cache:
509
+ value: true
510
+ use_cpu:
511
+ value: false
512
+ use_ipex:
513
+ value: false
514
+ use_legacy_prediction_loop:
515
+ value: false
516
+ use_mps_device:
517
+ value: false
518
+ vocab_size:
519
+ value: 32001
520
+ warmup_ratio:
521
+ value: 0
522
+ warmup_steps:
523
+ value: 0
524
+ weight_decay:
525
+ value: 0
wandb/run-20250408_144457-lfekgvx4/files/output.log ADDED
@@ -0,0 +1,444 @@
1
+ /workspace/BitDistiller/BitDistillerVenv/lib/python3.9/site-packages/accelerate/accelerator.py:432: FutureWarning: Passing the following arguments to `Accelerator` is deprecated and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches']). Please pass an `accelerate.DataLoaderConfiguration` instead:
2
+ dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
3
+ warnings.warn(
4
+ Using /root/.cache/torch_extensions/py39_cu124 as PyTorch extensions root...
5
+ Emitting ninja build file /root/.cache/torch_extensions/py39_cu124/cpu_adam/build.ninja...
6
+ Building extension module cpu_adam...
7
+ Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
8
+ Loading extension module cpu_adam...
9
+ Time to load cpu_adam op: 3.5851783752441406 seconds
10
+ [2025-04-08 14:45:18,142] [WARNING] [lr_schedules.py:683:get_lr] Attempting to get learning rate from scheduler before it has started
11
+ 0%| | 0/400 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
12
+ /workspace/BitDistiller/BitDistillerVenv/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py:745: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
13
+ return fn(*args, **kwargs)
14
+
15
+ {'loss': 4071.72, 'learning_rate': 0.0, 'epoch': 0.01}
16
+ {'loss': 3703.082, 'learning_rate': 2e-05, 'epoch': 0.02}
17
+ {'loss': 2781.7637, 'learning_rate': 2e-05, 'epoch': 0.03}
18
+ {'loss': 1114.2493, 'learning_rate': 2e-05, 'epoch': 0.04}
19
+ {'loss': 1199.9904, 'learning_rate': 2e-05, 'epoch': 0.05}
20
+ {'loss': 960.8013, 'learning_rate': 2e-05, 'epoch': 0.06}
21
+ {'loss': 562.4638, 'learning_rate': 2e-05, 'epoch': 0.07}
22
+ {'loss': 491.1947, 'learning_rate': 2e-05, 'epoch': 0.08}
23
+ {'loss': 676.5388, 'learning_rate': 2e-05, 'epoch': 0.09}
24
+ {'loss': 561.606, 'learning_rate': 2e-05, 'epoch': 0.1}
25
+ {'loss': 460.3655, 'learning_rate': 2e-05, 'epoch': 0.11}
26
+ {'loss': 381.4328, 'learning_rate': 2e-05, 'epoch': 0.12}
27
+ {'loss': 424.3463, 'learning_rate': 2e-05, 'epoch': 0.13}
28
+ {'loss': 453.681, 'learning_rate': 2e-05, 'epoch': 0.14}
29
+ {'loss': 315.2904, 'learning_rate': 2e-05, 'epoch': 0.15}
30
+ {'loss': 365.4308, 'learning_rate': 2e-05, 'epoch': 0.16}
31
+ {'loss': 342.7556, 'learning_rate': 2e-05, 'epoch': 0.17}
32
+ {'loss': 412.6731, 'learning_rate': 2e-05, 'epoch': 0.18}
33
+ {'loss': 338.8108, 'learning_rate': 2e-05, 'epoch': 0.19}
34
+ {'loss': 297.2161, 'learning_rate': 2e-05, 'epoch': 0.2}
35
+ {'loss': 249.6281, 'learning_rate': 2e-05, 'epoch': 0.21}
36
+ {'loss': 366.7344, 'learning_rate': 2e-05, 'epoch': 0.22}
37
+ {'loss': 244.1432, 'learning_rate': 2e-05, 'epoch': 0.23}
38
+ {'loss': 258.0739, 'learning_rate': 2e-05, 'epoch': 0.24}
39
+ {'loss': 215.2999, 'learning_rate': 2e-05, 'epoch': 0.25}
40
+ {'loss': 333.4258, 'learning_rate': 2e-05, 'epoch': 0.26}
41
+ {'loss': 283.4808, 'learning_rate': 2e-05, 'epoch': 0.27}
42
+ {'loss': 263.1878, 'learning_rate': 2e-05, 'epoch': 0.28}
43
+ {'loss': 262.2584, 'learning_rate': 2e-05, 'epoch': 0.29}
44
+ {'loss': 325.0822, 'learning_rate': 2e-05, 'epoch': 0.3}
45
+ {'loss': 311.2755, 'learning_rate': 2e-05, 'epoch': 0.31}
46
+ {'loss': 268.586, 'learning_rate': 2e-05, 'epoch': 0.32}
47
+ {'loss': 220.5104, 'learning_rate': 2e-05, 'epoch': 0.33}
48
+ {'loss': 322.1878, 'learning_rate': 2e-05, 'epoch': 0.34}
49
+ {'loss': 328.3446, 'learning_rate': 2e-05, 'epoch': 0.35}
50
+ {'loss': 312.3932, 'learning_rate': 2e-05, 'epoch': 0.36}
51
+ {'loss': 309.6428, 'learning_rate': 2e-05, 'epoch': 0.37}
52
+ {'loss': 272.0872, 'learning_rate': 2e-05, 'epoch': 0.38}
53
+ {'loss': 246.57, 'learning_rate': 2e-05, 'epoch': 0.39}
54
+ {'loss': 208.353, 'learning_rate': 2e-05, 'epoch': 0.4}
55
+ return fn(*args, **kwargs)
56
+ {'eval_loss': 257.48114013671875, 'eval_runtime': 80.3006, 'eval_samples_per_second': 19.925, 'eval_steps_per_second': 1.245, 'epoch': 0.4}
57
+
58
+ {'loss': 296.7864, 'learning_rate': 2e-05, 'epoch': 0.41}
59
+ {'loss': 229.0343, 'learning_rate': 2e-05, 'epoch': 0.42}
60
+ {'loss': 260.5302, 'learning_rate': 2e-05, 'epoch': 0.43}
61
+ {'loss': 234.2027, 'learning_rate': 2e-05, 'epoch': 0.44}
62
+ {'loss': 274.2948, 'learning_rate': 2e-05, 'epoch': 0.45}
63
+ {'loss': 286.5205, 'learning_rate': 2e-05, 'epoch': 0.46}
64
+ {'loss': 234.4089, 'learning_rate': 2e-05, 'epoch': 0.47}
65
+ {'loss': 248.8474, 'learning_rate': 2e-05, 'epoch': 0.48}
66
+ {'loss': 221.241, 'learning_rate': 2e-05, 'epoch': 0.49}
67
+ {'loss': 210.0346, 'learning_rate': 2e-05, 'epoch': 0.5}
68
+ {'loss': 211.4352, 'learning_rate': 2e-05, 'epoch': 0.51}
69
+ {'loss': 215.0837, 'learning_rate': 2e-05, 'epoch': 0.52}
70
+ {'loss': 229.1569, 'learning_rate': 2e-05, 'epoch': 0.53}
71
+ {'loss': 226.5081, 'learning_rate': 2e-05, 'epoch': 0.54}
72
+ {'loss': 235.0796, 'learning_rate': 2e-05, 'epoch': 0.55}
73
+ {'loss': 272.6122, 'learning_rate': 2e-05, 'epoch': 0.56}
74
+ {'loss': 308.3118, 'learning_rate': 2e-05, 'epoch': 0.57}
75
+ {'loss': 214.8627, 'learning_rate': 2e-05, 'epoch': 0.58}
76
+ {'loss': 262.7796, 'learning_rate': 2e-05, 'epoch': 0.59}
77
+ {'loss': 370.6855, 'learning_rate': 2e-05, 'epoch': 0.6}
78
+ {'loss': 232.5462, 'learning_rate': 2e-05, 'epoch': 0.61}
79
+ {'loss': 186.3443, 'learning_rate': 2e-05, 'epoch': 0.62}
80
+ {'loss': 244.9828, 'learning_rate': 2e-05, 'epoch': 0.63}
81
+ {'loss': 267.2331, 'learning_rate': 2e-05, 'epoch': 0.64}
82
+ {'loss': 329.9471, 'learning_rate': 2e-05, 'epoch': 0.65}
83
+ {'loss': 220.8, 'learning_rate': 2e-05, 'epoch': 0.66}
84
+ {'loss': 217.0322, 'learning_rate': 2e-05, 'epoch': 0.67}
85
+ {'loss': 236.0187, 'learning_rate': 2e-05, 'epoch': 0.68}
86
+ {'loss': 210.891, 'learning_rate': 2e-05, 'epoch': 0.69}
87
+ {'loss': 280.5106, 'learning_rate': 2e-05, 'epoch': 0.7}
88
+ {'loss': 168.9246, 'learning_rate': 2e-05, 'epoch': 0.71}
89
+ {'loss': 207.4843, 'learning_rate': 2e-05, 'epoch': 0.72}
90
+ {'loss': 201.0073, 'learning_rate': 2e-05, 'epoch': 0.73}
91
+ {'loss': 220.3829, 'learning_rate': 2e-05, 'epoch': 0.74}
92
+ {'loss': 286.062, 'learning_rate': 2e-05, 'epoch': 0.75}
93
+ {'loss': 206.1716, 'learning_rate': 2e-05, 'epoch': 0.76}
94
+ {'loss': 238.8352, 'learning_rate': 2e-05, 'epoch': 0.77}
95
+ {'loss': 259.6826, 'learning_rate': 2e-05, 'epoch': 0.78}
96
+ {'loss': 222.0822, 'learning_rate': 2e-05, 'epoch': 0.79}
97
+ {'loss': 184.0139, 'learning_rate': 2e-05, 'epoch': 0.8}
98
+ return fn(*args, **kwargs)
99
+ {'eval_loss': 216.35833740234375, 'eval_runtime': 79.5174, 'eval_samples_per_second': 20.121, 'eval_steps_per_second': 1.258, 'epoch': 0.8}
100
+
101
+ {'loss': 222.5852, 'learning_rate': 2e-05, 'epoch': 0.81}
102
+ {'loss': 230.8559, 'learning_rate': 2e-05, 'epoch': 0.82}
103
+ {'loss': 175.8692, 'learning_rate': 2e-05, 'epoch': 0.83}
104
+ {'loss': 162.3217, 'learning_rate': 2e-05, 'epoch': 0.84}
105
+ {'loss': 255.5928, 'learning_rate': 2e-05, 'epoch': 0.85}
106
+ {'loss': 299.134, 'learning_rate': 2e-05, 'epoch': 0.86}
107
+ {'loss': 168.649, 'learning_rate': 2e-05, 'epoch': 0.87}
108
+ {'loss': 220.7846, 'learning_rate': 2e-05, 'epoch': 0.88}
109
+ {'loss': 185.0179, 'learning_rate': 2e-05, 'epoch': 0.89}
110
+ {'loss': 147.8685, 'learning_rate': 2e-05, 'epoch': 0.9}
111
+ {'loss': 312.9603, 'learning_rate': 2e-05, 'epoch': 0.91}
112
+ {'loss': 313.5643, 'learning_rate': 2e-05, 'epoch': 0.92}
113
+ {'loss': 191.7977, 'learning_rate': 2e-05, 'epoch': 0.93}
114
+ {'loss': 211.8976, 'learning_rate': 2e-05, 'epoch': 0.94}
115
+ {'loss': 245.4843, 'learning_rate': 2e-05, 'epoch': 0.95}
116
+ {'loss': 323.8485, 'learning_rate': 2e-05, 'epoch': 0.96}
117
+ {'loss': 185.5315, 'learning_rate': 2e-05, 'epoch': 0.97}
118
+ {'loss': 190.9561, 'learning_rate': 2e-05, 'epoch': 0.98}
119
+ {'loss': 226.4656, 'learning_rate': 2e-05, 'epoch': 0.99}
120
+ {'loss': 224.4983, 'learning_rate': 2e-05, 'epoch': 1.0}
121
+ {'loss': 244.7243, 'learning_rate': 2e-05, 'epoch': 1.01}
122
+ {'loss': 241.325, 'learning_rate': 2e-05, 'epoch': 1.02}
123
+ {'loss': 186.9283, 'learning_rate': 2e-05, 'epoch': 1.03}
124
+ {'loss': 233.9825, 'learning_rate': 2e-05, 'epoch': 1.04}
125
+ {'loss': 229.6935, 'learning_rate': 2e-05, 'epoch': 1.05}
126
+ {'loss': 209.1252, 'learning_rate': 2e-05, 'epoch': 1.06}
127
+ {'loss': 194.21, 'learning_rate': 2e-05, 'epoch': 1.07}
128
+ {'loss': 180.2423, 'learning_rate': 2e-05, 'epoch': 1.08}
129
+ {'loss': 188.6742, 'learning_rate': 2e-05, 'epoch': 1.09}
130
+ {'loss': 165.752, 'learning_rate': 2e-05, 'epoch': 1.1}
131
+ {'loss': 190.4504, 'learning_rate': 2e-05, 'epoch': 1.11}
132
+ {'loss': 199.6766, 'learning_rate': 2e-05, 'epoch': 1.12}
133
+ {'loss': 213.1245, 'learning_rate': 2e-05, 'epoch': 1.13}
134
+ {'loss': 169.752, 'learning_rate': 2e-05, 'epoch': 1.14}
135
+ {'loss': 197.0398, 'learning_rate': 2e-05, 'epoch': 1.15}
136
+ {'loss': 216.4903, 'learning_rate': 2e-05, 'epoch': 1.16}
137
+ {'loss': 195.2799, 'learning_rate': 2e-05, 'epoch': 1.17}
138
+ {'loss': 253.0853, 'learning_rate': 2e-05, 'epoch': 1.18}
139
+ {'loss': 231.8589, 'learning_rate': 2e-05, 'epoch': 1.19}
140
+ {'loss': 163.2534, 'learning_rate': 2e-05, 'epoch': 1.2}
141
+ return fn(*args, **kwargs)
142
+ {'eval_loss': 206.70603942871094, 'eval_runtime': 79.536, 'eval_samples_per_second': 20.117, 'eval_steps_per_second': 1.257, 'epoch': 1.2}
143
+
144
+ {'loss': 161.1305, 'learning_rate': 2e-05, 'epoch': 1.21}
145
+ {'loss': 174.6565, 'learning_rate': 2e-05, 'epoch': 1.22}
146
+ {'loss': 138.2845, 'learning_rate': 2e-05, 'epoch': 1.23}
147
+ {'loss': 170.7682, 'learning_rate': 2e-05, 'epoch': 1.24}
148
+ {'loss': 167.88, 'learning_rate': 2e-05, 'epoch': 1.25}
149
+ {'loss': 186.3354, 'learning_rate': 2e-05, 'epoch': 1.26}
150
+ {'loss': 201.62, 'learning_rate': 2e-05, 'epoch': 1.27}
151
+ {'loss': 186.3384, 'learning_rate': 2e-05, 'epoch': 1.28}
152
+ {'loss': 244.3932, 'learning_rate': 2e-05, 'epoch': 1.29}
153
+ {'loss': 156.0725, 'learning_rate': 2e-05, 'epoch': 1.3}
154
+ {'loss': 254.407, 'learning_rate': 2e-05, 'epoch': 1.31}
155
+ {'loss': 222.1227, 'learning_rate': 2e-05, 'epoch': 1.32}
156
+ {'loss': 218.2704, 'learning_rate': 2e-05, 'epoch': 1.33}
157
+ {'loss': 172.4145, 'learning_rate': 2e-05, 'epoch': 1.34}
158
+ {'loss': 200.7423, 'learning_rate': 2e-05, 'epoch': 1.35}
159
+ {'loss': 149.023, 'learning_rate': 2e-05, 'epoch': 1.36}
160
+ {'loss': 155.711, 'learning_rate': 2e-05, 'epoch': 1.37}
161
+ {'loss': 219.9209, 'learning_rate': 2e-05, 'epoch': 1.38}
162
+ {'loss': 238.2001, 'learning_rate': 2e-05, 'epoch': 1.39}
163
+ {'loss': 178.3544, 'learning_rate': 2e-05, 'epoch': 1.4}
164
+ {'loss': 213.0709, 'learning_rate': 2e-05, 'epoch': 1.41}
165
+ {'loss': 240.6205, 'learning_rate': 2e-05, 'epoch': 1.42}
166
+ {'loss': 164.7779, 'learning_rate': 2e-05, 'epoch': 1.43}
167
+ {'loss': 164.4412, 'learning_rate': 2e-05, 'epoch': 1.44}
168
+ {'loss': 144.966, 'learning_rate': 2e-05, 'epoch': 1.45}
169
+ {'loss': 174.9196, 'learning_rate': 2e-05, 'epoch': 1.46}
170
+ {'loss': 148.8152, 'learning_rate': 2e-05, 'epoch': 1.47}
171
+ {'loss': 167.7342, 'learning_rate': 2e-05, 'epoch': 1.48}
172
+ {'loss': 149.2933, 'learning_rate': 2e-05, 'epoch': 1.49}
173
+ {'loss': 146.4294, 'learning_rate': 2e-05, 'epoch': 1.5}
174
+ {'loss': 283.1047, 'learning_rate': 2e-05, 'epoch': 1.51}
175
+ {'loss': 254.2791, 'learning_rate': 2e-05, 'epoch': 1.52}
176
+ {'loss': 242.7166, 'learning_rate': 2e-05, 'epoch': 1.53}
177
+ {'loss': 309.6984, 'learning_rate': 2e-05, 'epoch': 1.54}
178
+ {'loss': 268.4947, 'learning_rate': 2e-05, 'epoch': 1.55}
179
+ {'loss': 297.1925, 'learning_rate': 2e-05, 'epoch': 1.56}
180
+ {'loss': 218.9994, 'learning_rate': 2e-05, 'epoch': 1.57}
181
+ {'loss': 240.4023, 'learning_rate': 2e-05, 'epoch': 1.58}
182
+ {'loss': 188.0461, 'learning_rate': 2e-05, 'epoch': 1.59}
183
+ {'loss': 178.4397, 'learning_rate': 2e-05, 'epoch': 1.6}
184
+ return fn(*args, **kwargs)
185
+ {'eval_loss': 187.73745727539062, 'eval_runtime': 79.4337, 'eval_samples_per_second': 20.143, 'eval_steps_per_second': 1.259, 'epoch': 1.6}
186
+
187
+ {'loss': 186.6675, 'learning_rate': 2e-05, 'epoch': 1.61}
188
+ {'loss': 200.9526, 'learning_rate': 2e-05, 'epoch': 1.62}
189
+ {'loss': 133.3034, 'learning_rate': 2e-05, 'epoch': 1.63}
190
+ {'loss': 159.8653, 'learning_rate': 2e-05, 'epoch': 1.64}
191
+ {'loss': 202.0873, 'learning_rate': 2e-05, 'epoch': 1.65}
192
+ {'loss': 159.9434, 'learning_rate': 2e-05, 'epoch': 1.66}
193
+ {'loss': 162.8017, 'learning_rate': 2e-05, 'epoch': 1.67}
194
+ {'loss': 189.0643, 'learning_rate': 2e-05, 'epoch': 1.68}
195
+ {'loss': 137.1652, 'learning_rate': 2e-05, 'epoch': 1.69}
196
+ {'loss': 150.537, 'learning_rate': 2e-05, 'epoch': 1.7}
197
+ {'loss': 154.7406, 'learning_rate': 2e-05, 'epoch': 1.71}
198
+ {'loss': 169.6786, 'learning_rate': 2e-05, 'epoch': 1.72}
199
+ {'loss': 136.2818, 'learning_rate': 2e-05, 'epoch': 1.73}
200
+ {'loss': 167.8268, 'learning_rate': 2e-05, 'epoch': 1.74}
201
+ {'loss': 145.7958, 'learning_rate': 2e-05, 'epoch': 1.75}
202
+ {'loss': 154.7224, 'learning_rate': 2e-05, 'epoch': 1.76}
203
+ {'loss': 178.7872, 'learning_rate': 2e-05, 'epoch': 1.77}
204
+ {'loss': 128.0842, 'learning_rate': 2e-05, 'epoch': 1.78}
205
+ {'loss': 182.1507, 'learning_rate': 2e-05, 'epoch': 1.79}
206
+ {'loss': 150.6525, 'learning_rate': 2e-05, 'epoch': 1.8}
207
+ {'loss': 142.9066, 'learning_rate': 2e-05, 'epoch': 1.81}
208
+ {'loss': 167.2447, 'learning_rate': 2e-05, 'epoch': 1.82}
209
+ {'loss': 154.2222, 'learning_rate': 2e-05, 'epoch': 1.83}
210
+ {'loss': 126.9097, 'learning_rate': 2e-05, 'epoch': 1.84}
211
+ {'loss': 184.0902, 'learning_rate': 2e-05, 'epoch': 1.85}
212
+ {'loss': 163.555, 'learning_rate': 2e-05, 'epoch': 1.86}
213
+ {'loss': 135.321, 'learning_rate': 2e-05, 'epoch': 1.87}
214
+ {'loss': 208.7065, 'learning_rate': 2e-05, 'epoch': 1.88}
215
+ {'loss': 153.2776, 'learning_rate': 2e-05, 'epoch': 1.89}
216
+ {'loss': 108.0491, 'learning_rate': 2e-05, 'epoch': 1.9}
217
+ {'loss': 205.144, 'learning_rate': 2e-05, 'epoch': 1.91}
218
+ {'loss': 134.3983, 'learning_rate': 2e-05, 'epoch': 1.92}
219
+ {'loss': 140.7616, 'learning_rate': 2e-05, 'epoch': 1.93}
220
+ {'loss': 148.2466, 'learning_rate': 2e-05, 'epoch': 1.94}
221
+ {'loss': 146.2918, 'learning_rate': 2e-05, 'epoch': 1.95}
222
+ {'loss': 147.8167, 'learning_rate': 2e-05, 'epoch': 1.96}
223
+ {'loss': 185.2488, 'learning_rate': 2e-05, 'epoch': 1.97}
224
+ {'loss': 142.9446, 'learning_rate': 2e-05, 'epoch': 1.98}
225
+ {'loss': 157.2825, 'learning_rate': 2e-05, 'epoch': 1.99}
226
+ {'loss': 138.4444, 'learning_rate': 2e-05, 'epoch': 2.0}
227
+ return fn(*args, **kwargs)
228
+ {'eval_loss': 160.57733154296875, 'eval_runtime': 79.3738, 'eval_samples_per_second': 20.158, 'eval_steps_per_second': 1.26, 'epoch': 2.0}
229
+
230
+ {'loss': 165.2775, 'learning_rate': 2e-05, 'epoch': 2.01}
231
+ {'loss': 116.8421, 'learning_rate': 2e-05, 'epoch': 2.02}
232
+ {'loss': 146.9597, 'learning_rate': 2e-05, 'epoch': 2.03}
233
+ {'loss': 120.4051, 'learning_rate': 2e-05, 'epoch': 2.04}
234
+ {'loss': 143.2971, 'learning_rate': 2e-05, 'epoch': 2.05}
235
+ {'loss': 177.3436, 'learning_rate': 2e-05, 'epoch': 2.06}
236
+ {'loss': 150.2112, 'learning_rate': 2e-05, 'epoch': 2.07}
237
+ {'loss': 128.2945, 'learning_rate': 2e-05, 'epoch': 2.08}
238
+ {'loss': 131.5872, 'learning_rate': 2e-05, 'epoch': 2.09}
239
+ {'loss': 104.6008, 'learning_rate': 2e-05, 'epoch': 2.1}
240
+ {'loss': 123.1104, 'learning_rate': 2e-05, 'epoch': 2.11}
241
+ {'loss': 166.9186, 'learning_rate': 2e-05, 'epoch': 2.12}
242
+ {'loss': 148.2508, 'learning_rate': 2e-05, 'epoch': 2.13}
243
+ {'loss': 137.8753, 'learning_rate': 2e-05, 'epoch': 2.14}
244
+ {'loss': 158.4357, 'learning_rate': 2e-05, 'epoch': 2.15}
245
+ {'loss': 155.4972, 'learning_rate': 2e-05, 'epoch': 2.16}
246
+ {'loss': 129.5236, 'learning_rate': 2e-05, 'epoch': 2.17}
247
+ {'loss': 152.5265, 'learning_rate': 2e-05, 'epoch': 2.18}
248
+ {'loss': 124.1736, 'learning_rate': 2e-05, 'epoch': 2.19}
249
+ {'loss': 133.7729, 'learning_rate': 2e-05, 'epoch': 2.2}
250
+ {'loss': 141.1679, 'learning_rate': 2e-05, 'epoch': 2.21}
251
+ {'loss': 119.9498, 'learning_rate': 2e-05, 'epoch': 2.22}
252
+ {'loss': 116.8556, 'learning_rate': 2e-05, 'epoch': 2.23}
253
+ {'loss': 131.5023, 'learning_rate': 2e-05, 'epoch': 2.24}
254
+ {'loss': 112.0261, 'learning_rate': 2e-05, 'epoch': 2.25}
255
+ {'loss': 157.3943, 'learning_rate': 2e-05, 'epoch': 2.26}
256
+ {'loss': 168.668, 'learning_rate': 2e-05, 'epoch': 2.27}
257
+ {'loss': 124.1008, 'learning_rate': 2e-05, 'epoch': 2.28}
258
+ {'loss': 147.391, 'learning_rate': 2e-05, 'epoch': 2.29}
259
+ {'loss': 137.8744, 'learning_rate': 2e-05, 'epoch': 2.3}
260
+ {'loss': 144.7814, 'learning_rate': 2e-05, 'epoch': 2.31}
261
+ {'loss': 115.5259, 'learning_rate': 2e-05, 'epoch': 2.32}
262
+ {'loss': 130.6777, 'learning_rate': 2e-05, 'epoch': 2.33}
263
+ {'loss': 133.4718, 'learning_rate': 2e-05, 'epoch': 2.34}
264
+ {'loss': 129.8709, 'learning_rate': 2e-05, 'epoch': 2.35}
265
+ {'loss': 115.9226, 'learning_rate': 2e-05, 'epoch': 2.36}
266
+ {'loss': 190.8544, 'learning_rate': 2e-05, 'epoch': 2.37}
267
+ {'loss': 122.1731, 'learning_rate': 2e-05, 'epoch': 2.38}
268
+ {'loss': 131.187, 'learning_rate': 2e-05, 'epoch': 2.39}
269
+ {'loss': 107.896, 'learning_rate': 2e-05, 'epoch': 2.4}
270
+ return fn(*args, **kwargs)
271
+ {'eval_loss': 158.72845458984375, 'eval_runtime': 79.5651, 'eval_samples_per_second': 20.109, 'eval_steps_per_second': 1.257, 'epoch': 2.4}
272
+
273
+ {'loss': 130.5046, 'learning_rate': 2e-05, 'epoch': 2.41}
274
+ {'loss': 165.2993, 'learning_rate': 2e-05, 'epoch': 2.42}
275
+ {'loss': 111.5661, 'learning_rate': 2e-05, 'epoch': 2.43}
276
+ {'loss': 114.7366, 'learning_rate': 2e-05, 'epoch': 2.44}
277
+ {'loss': 137.7538, 'learning_rate': 2e-05, 'epoch': 2.45}
278
+ {'loss': 151.246, 'learning_rate': 2e-05, 'epoch': 2.46}
279
+ {'loss': 117.9011, 'learning_rate': 2e-05, 'epoch': 2.47}
280
+ {'loss': 147.3726, 'learning_rate': 2e-05, 'epoch': 2.48}
281
+ {'loss': 183.4265, 'learning_rate': 2e-05, 'epoch': 2.49}
282
+ {'loss': 117.1957, 'learning_rate': 2e-05, 'epoch': 2.5}
283
+ {'loss': 118.1944, 'learning_rate': 2e-05, 'epoch': 2.51}
284
+ {'loss': 114.6506, 'learning_rate': 2e-05, 'epoch': 2.52}
285
+ {'loss': 103.8539, 'learning_rate': 2e-05, 'epoch': 2.53}
286
+ {'loss': 151.9482, 'learning_rate': 2e-05, 'epoch': 2.54}
287
+ {'loss': 133.9992, 'learning_rate': 2e-05, 'epoch': 2.55}
288
+ {'loss': 128.3978, 'learning_rate': 2e-05, 'epoch': 2.56}
289
+ {'loss': 181.3923, 'learning_rate': 2e-05, 'epoch': 2.57}
290
+ {'loss': 133.9905, 'learning_rate': 2e-05, 'epoch': 2.58}
291
+ {'loss': 157.8653, 'learning_rate': 2e-05, 'epoch': 2.59}
292
+ {'loss': 118.4984, 'learning_rate': 2e-05, 'epoch': 2.6}
293
+ {'loss': 170.4937, 'learning_rate': 2e-05, 'epoch': 2.61}
294
+ {'loss': 130.0427, 'learning_rate': 2e-05, 'epoch': 2.62}
295
+ {'loss': 138.0619, 'learning_rate': 2e-05, 'epoch': 2.63}
296
+ {'loss': 130.8616, 'learning_rate': 2e-05, 'epoch': 2.64}
297
+ {'loss': 112.5762, 'learning_rate': 2e-05, 'epoch': 2.65}
298
+ {'loss': 135.076, 'learning_rate': 2e-05, 'epoch': 2.66}
299
+ {'loss': 138.3206, 'learning_rate': 2e-05, 'epoch': 2.67}
300
+ {'loss': 110.7209, 'learning_rate': 2e-05, 'epoch': 2.68}
301
+ {'loss': 139.3679, 'learning_rate': 2e-05, 'epoch': 2.69}
302
+ {'loss': 148.1507, 'learning_rate': 2e-05, 'epoch': 2.7}
303
+ {'loss': 133.993, 'learning_rate': 2e-05, 'epoch': 2.71}
304
+ {'loss': 126.9287, 'learning_rate': 2e-05, 'epoch': 2.72}
305
+ {'loss': 147.0871, 'learning_rate': 2e-05, 'epoch': 2.73}
306
+ {'loss': 119.9809, 'learning_rate': 2e-05, 'epoch': 2.74}
307
+ {'loss': 158.0596, 'learning_rate': 2e-05, 'epoch': 2.75}
308
+ {'loss': 131.581, 'learning_rate': 2e-05, 'epoch': 2.76}
309
+ {'loss': 147.5745, 'learning_rate': 2e-05, 'epoch': 2.77}
310
+ {'loss': 144.1102, 'learning_rate': 2e-05, 'epoch': 2.78}
311
+ {'loss': 130.1609, 'learning_rate': 2e-05, 'epoch': 2.79}
312
+ {'loss': 132.7878, 'learning_rate': 2e-05, 'epoch': 2.8}
313
+ return fn(*args, **kwargs)
314
+ {'eval_loss': 167.42173767089844, 'eval_runtime': 79.4589, 'eval_samples_per_second': 20.136, 'eval_steps_per_second': 1.259, 'epoch': 2.8}
315
+
316
+ {'loss': 140.5111, 'learning_rate': 2e-05, 'epoch': 2.81}
317
+ {'loss': 106.5711, 'learning_rate': 2e-05, 'epoch': 2.82}
318
+ {'loss': 132.0258, 'learning_rate': 2e-05, 'epoch': 2.83}
319
+ {'loss': 127.3889, 'learning_rate': 2e-05, 'epoch': 2.84}
320
+ {'loss': 117.4807, 'learning_rate': 2e-05, 'epoch': 2.85}
321
+ {'loss': 128.9161, 'learning_rate': 2e-05, 'epoch': 2.86}
322
+ {'loss': 124.6077, 'learning_rate': 2e-05, 'epoch': 2.87}
323
+ {'loss': 119.5677, 'learning_rate': 2e-05, 'epoch': 2.88}
324
+ {'loss': 128.5877, 'learning_rate': 2e-05, 'epoch': 2.89}
325
+ {'loss': 122.5506, 'learning_rate': 2e-05, 'epoch': 2.9}
326
+ {'loss': 136.287, 'learning_rate': 2e-05, 'epoch': 2.91}
327
+ {'loss': 135.4415, 'learning_rate': 2e-05, 'epoch': 2.92}
328
+ {'loss': 132.4624, 'learning_rate': 2e-05, 'epoch': 2.93}
329
+ {'loss': 153.5935, 'learning_rate': 2e-05, 'epoch': 2.94}
330
+ {'loss': 133.0417, 'learning_rate': 2e-05, 'epoch': 2.95}
331
+ {'loss': 131.7215, 'learning_rate': 2e-05, 'epoch': 2.96}
332
+ {'loss': 126.3043, 'learning_rate': 2e-05, 'epoch': 2.97}
333
+ {'loss': 154.6142, 'learning_rate': 2e-05, 'epoch': 2.98}
334
+ {'loss': 131.4609, 'learning_rate': 2e-05, 'epoch': 2.99}
335
+ {'loss': 131.8201, 'learning_rate': 2e-05, 'epoch': 3.0}
336
+ {'loss': 213.9727, 'learning_rate': 2e-05, 'epoch': 3.01}
337
+ {'loss': 140.8377, 'learning_rate': 2e-05, 'epoch': 3.02}
338
+ {'loss': 100.0451, 'learning_rate': 2e-05, 'epoch': 3.03}
339
+ {'loss': 88.7423, 'learning_rate': 2e-05, 'epoch': 3.04}
340
+ {'loss': 118.7, 'learning_rate': 2e-05, 'epoch': 3.05}
341
+ {'loss': 122.5366, 'learning_rate': 2e-05, 'epoch': 3.06}
342
+ {'loss': 139.358, 'learning_rate': 2e-05, 'epoch': 3.07}
343
+ {'loss': 104.6418, 'learning_rate': 2e-05, 'epoch': 3.08}
344
+ {'loss': 98.5369, 'learning_rate': 2e-05, 'epoch': 3.09}
345
+ {'loss': 140.2156, 'learning_rate': 2e-05, 'epoch': 3.1}
346
+ {'loss': 177.5249, 'learning_rate': 2e-05, 'epoch': 3.11}
347
+ {'loss': 105.4918, 'learning_rate': 2e-05, 'epoch': 3.12}
348
+ {'loss': 120.8496, 'learning_rate': 2e-05, 'epoch': 3.13}
349
+ {'loss': 106.6245, 'learning_rate': 2e-05, 'epoch': 3.14}
350
+ {'loss': 99.1943, 'learning_rate': 2e-05, 'epoch': 3.15}
351
+ {'loss': 124.0147, 'learning_rate': 2e-05, 'epoch': 3.16}
352
+ {'loss': 100.3534, 'learning_rate': 2e-05, 'epoch': 3.17}
353
+ {'loss': 105.411, 'learning_rate': 2e-05, 'epoch': 3.18}
354
+ {'loss': 107.7018, 'learning_rate': 2e-05, 'epoch': 3.19}
355
+ {'loss': 151.677, 'learning_rate': 2e-05, 'epoch': 3.2}
356
+ return fn(*args, **kwargs)
357
+ {'eval_loss': 155.28150939941406, 'eval_runtime': 79.3832, 'eval_samples_per_second': 20.155, 'eval_steps_per_second': 1.26, 'epoch': 3.2}
358
+
359
+ {'loss': 134.284, 'learning_rate': 2e-05, 'epoch': 3.21}
360
+ {'loss': 111.1241, 'learning_rate': 2e-05, 'epoch': 3.22}
361
+ {'loss': 106.1226, 'learning_rate': 2e-05, 'epoch': 3.23}
362
+ {'loss': 111.9273, 'learning_rate': 2e-05, 'epoch': 3.24}
363
+ {'loss': 101.3723, 'learning_rate': 2e-05, 'epoch': 3.25}
364
+ {'loss': 111.4945, 'learning_rate': 2e-05, 'epoch': 3.26}
365
+ {'loss': 113.5313, 'learning_rate': 2e-05, 'epoch': 3.27}
366
+ {'loss': 137.6588, 'learning_rate': 2e-05, 'epoch': 3.28}
367
+ {'loss': 110.0612, 'learning_rate': 2e-05, 'epoch': 3.29}
368
+ {'loss': 281.9487, 'learning_rate': 2e-05, 'epoch': 3.3}
369
+ {'loss': 111.9114, 'learning_rate': 2e-05, 'epoch': 3.31}
370
+ {'loss': 117.2939, 'learning_rate': 2e-05, 'epoch': 3.32}
371
+ {'loss': 152.7591, 'learning_rate': 2e-05, 'epoch': 3.33}
372
+ {'loss': 115.9719, 'learning_rate': 2e-05, 'epoch': 3.34}
373
+ {'loss': 164.3833, 'learning_rate': 2e-05, 'epoch': 3.35}
374
+ {'loss': 109.1939, 'learning_rate': 2e-05, 'epoch': 3.36}
375
+ {'loss': 117.5128, 'learning_rate': 2e-05, 'epoch': 3.37}
376
+ {'loss': 128.3874, 'learning_rate': 2e-05, 'epoch': 3.38}
377
+ {'loss': 102.7095, 'learning_rate': 2e-05, 'epoch': 3.39}
378
+ {'loss': 163.9961, 'learning_rate': 2e-05, 'epoch': 3.4}
379
+ {'loss': 108.496, 'learning_rate': 2e-05, 'epoch': 3.41}
380
+ {'loss': 123.8302, 'learning_rate': 2e-05, 'epoch': 3.42}
381
+ {'loss': 145.3561, 'learning_rate': 2e-05, 'epoch': 3.43}
382
+ {'loss': 136.6116, 'learning_rate': 2e-05, 'epoch': 3.44}
383
+ {'loss': 94.3402, 'learning_rate': 2e-05, 'epoch': 3.45}
384
+ {'loss': 152.6062, 'learning_rate': 2e-05, 'epoch': 3.46}
385
+ {'loss': 120.9629, 'learning_rate': 2e-05, 'epoch': 3.47}
386
+ {'loss': 138.7528, 'learning_rate': 2e-05, 'epoch': 3.48}
387
+ {'loss': 116.4446, 'learning_rate': 2e-05, 'epoch': 3.49}
388
+ {'loss': 120.0728, 'learning_rate': 2e-05, 'epoch': 3.5}
389
+ {'loss': 130.754, 'learning_rate': 2e-05, 'epoch': 3.51}
390
+ {'loss': 128.7542, 'learning_rate': 2e-05, 'epoch': 3.52}
391
+ {'loss': 132.6244, 'learning_rate': 2e-05, 'epoch': 3.53}
392
+ {'loss': 134.4236, 'learning_rate': 2e-05, 'epoch': 3.54}
393
+ {'loss': 120.6288, 'learning_rate': 2e-05, 'epoch': 3.55}
394
+ {'loss': 126.6259, 'learning_rate': 2e-05, 'epoch': 3.56}
395
+ {'loss': 124.4428, 'learning_rate': 2e-05, 'epoch': 3.57}
396
+ {'loss': 97.7157, 'learning_rate': 2e-05, 'epoch': 3.58}
397
+ {'loss': 131.44, 'learning_rate': 2e-05, 'epoch': 3.59}
398
+ {'loss': 120.7002, 'learning_rate': 2e-05, 'epoch': 3.6}
399
+ return fn(*args, **kwargs)
400
+ {'eval_loss': 162.07102966308594, 'eval_runtime': 79.4776, 'eval_samples_per_second': 20.131, 'eval_steps_per_second': 1.258, 'epoch': 3.6}
401
+
402
+ {'loss': 127.2023, 'learning_rate': 2e-05, 'epoch': 3.61}
403
+ {'loss': 111.7467, 'learning_rate': 2e-05, 'epoch': 3.62}
404
+ {'loss': 112.7507, 'learning_rate': 2e-05, 'epoch': 3.63}
405
+ {'loss': 107.9694, 'learning_rate': 2e-05, 'epoch': 3.64}
406
+ {'loss': 103.9277, 'learning_rate': 2e-05, 'epoch': 3.65}
407
+ {'loss': 117.3188, 'learning_rate': 2e-05, 'epoch': 3.66}
408
+ {'loss': 176.0649, 'learning_rate': 2e-05, 'epoch': 3.67}
409
+ {'loss': 163.6521, 'learning_rate': 2e-05, 'epoch': 3.68}
410
+ {'loss': 127.2647, 'learning_rate': 2e-05, 'epoch': 3.69}
411
+ {'loss': 134.6443, 'learning_rate': 2e-05, 'epoch': 3.7}
412
+ {'loss': 122.828, 'learning_rate': 2e-05, 'epoch': 3.71}
413
+ {'loss': 112.1338, 'learning_rate': 2e-05, 'epoch': 3.72}
414
+ {'loss': 110.6683, 'learning_rate': 2e-05, 'epoch': 3.73}
415
+ {'loss': 102.7184, 'learning_rate': 2e-05, 'epoch': 3.74}
416
+ {'loss': 117.4912, 'learning_rate': 2e-05, 'epoch': 3.75}
417
+ {'loss': 105.2073, 'learning_rate': 2e-05, 'epoch': 3.76}
418
+ {'loss': 136.171, 'learning_rate': 2e-05, 'epoch': 3.77}
419
+ {'loss': 106.406, 'learning_rate': 2e-05, 'epoch': 3.78}
420
+ {'loss': 147.6116, 'learning_rate': 2e-05, 'epoch': 3.79}
421
+ {'loss': 119.266, 'learning_rate': 2e-05, 'epoch': 3.8}
422
+ {'loss': 110.0501, 'learning_rate': 2e-05, 'epoch': 3.81}
423
+ {'loss': 98.5413, 'learning_rate': 2e-05, 'epoch': 3.82}
424
+ {'loss': 115.6795, 'learning_rate': 2e-05, 'epoch': 3.83}
425
+ {'loss': 102.3219, 'learning_rate': 2e-05, 'epoch': 3.84}
426
+ {'loss': 115.6896, 'learning_rate': 2e-05, 'epoch': 3.85}
427
+ {'loss': 108.7161, 'learning_rate': 2e-05, 'epoch': 3.86}
428
+ {'loss': 112.2514, 'learning_rate': 2e-05, 'epoch': 3.87}
429
+ {'loss': 142.0065, 'learning_rate': 2e-05, 'epoch': 3.88}
430
+ {'loss': 121.0375, 'learning_rate': 2e-05, 'epoch': 3.89}
431
+ {'loss': 158.0555, 'learning_rate': 2e-05, 'epoch': 3.9}
432
+ {'loss': 113.5199, 'learning_rate': 2e-05, 'epoch': 3.91}
433
+ {'loss': 126.3097, 'learning_rate': 2e-05, 'epoch': 3.92}
434
+ {'loss': 125.9837, 'learning_rate': 2e-05, 'epoch': 3.93}
435
+ {'loss': 130.3763, 'learning_rate': 2e-05, 'epoch': 3.94}
436
+ {'loss': 99.9474, 'learning_rate': 2e-05, 'epoch': 3.95}
437
+ {'loss': 109.8088, 'learning_rate': 2e-05, 'epoch': 3.96}
438
+ {'loss': 139.7369, 'learning_rate': 2e-05, 'epoch': 3.97}
439
+ {'loss': 101.4816, 'learning_rate': 2e-05, 'epoch': 3.98}
440
+ {'loss': 109.4982, 'learning_rate': 2e-05, 'epoch': 3.99}
441
+ {'loss': 107.9323, 'learning_rate': 2e-05, 'epoch': 4.0}
442
+
443
+ {'eval_loss': 147.53726196289062, 'eval_runtime': 79.4274, 'eval_samples_per_second': 20.144, 'eval_steps_per_second': 1.259, 'epoch': 4.0}
444
+ {'train_runtime': 4736.5342, 'train_samples_per_second': 5.405, 'train_steps_per_second': 0.084, 'train_loss': 210.14494161605836, 'epoch': 4.0}
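The eval_loss entries interleaved above give a quick picture of convergence (206.7 → 187.7 → 160.6 → 158.7 → 167.4 → 155.3 → 162.1 → 147.5 over epochs 1.2–4.0). A minimal sketch for pulling them back out of a saved copy of this trainer output; the file name output.log is an assumption for illustration, not an artifact of this run:

import ast

eval_losses = []
with open("output.log") as f:  # hypothetical path to wherever the raw trainer output was captured
    for line in f:
        line = line.strip().lstrip("+ ").strip()
        if line.startswith("{") and "'eval_loss'" in line:
            record = ast.literal_eval(line)  # the log lines are Python dict reprs, not JSON
            eval_losses.append((record["epoch"], record["eval_loss"]))

for epoch, loss in eval_losses:
    print(f"epoch {epoch}: eval_loss {loss:.2f}")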
wandb/run-20250408_144457-lfekgvx4/files/requirements.txt ADDED
@@ -0,0 +1,142 @@
1
+ setuptools==58.1.0
2
+ pip==23.0.1
3
+ wcwidth==0.2.13
4
+ triton==3.2.0
5
+ sqlitedict==2.1.0
6
+ sentencepiece==0.2.0
7
+ pytz==2025.2
8
+ py-cpuinfo==9.0.0
9
+ pure_eval==0.2.3
10
+ ptyprocess==0.7.0
11
+ nvidia-cusparselt-cu12==0.6.2
12
+ mpmath==1.3.0
13
+ hjson==3.1.0
14
+ Fraction==2.2.0
15
+ antlr4-python3-runtime==4.9.3
16
+ zstandard==0.23.0
17
+ zipp==3.21.0
18
+ xxhash==3.5.0
19
+ urllib3==2.3.0
20
+ tzdata==2025.2
21
+ typing_extensions==4.13.1
22
+ traitlets==5.14.3
23
+ tqdm==4.67.1
24
+ tornado==6.4.2
25
+ threadpoolctl==3.6.0
26
+ tcolorpy==0.1.7
27
+ tabulate==0.9.0
28
+ sympy==1.13.1
29
+ smmap==5.0.2
30
+ six==1.17.0
31
+ setproctitle==1.3.5
32
+ safetensors==0.5.3
33
+ regex==2024.11.6
34
+ pyzmq==26.4.0
35
+ PyYAML==6.0.2
36
+ Pygments==2.19.1
37
+ pycountry==24.6.1
38
+ pyarrow==19.0.1
39
+ psutil==7.0.0
40
+ protobuf==5.29.4
41
+ propcache==0.3.1
42
+ prompt_toolkit==3.0.50
43
+ portalocker==3.1.1
44
+ platformdirs==4.3.7
45
+ pexpect==4.9.0
46
+ pathvalidate==3.2.3
47
+ parso==0.8.4
48
+ packaging==24.2
49
+ nvidia-nvtx-cu12==12.4.127
50
+ nvidia-nvjitlink-cu12==12.4.127
51
+ nvidia-nccl-cu12==2.21.5
52
+ nvidia-curand-cu12==10.3.5.147
53
+ nvidia-cufft-cu12==11.2.1.3
54
+ nvidia-cuda-runtime-cu12==12.4.127
55
+ nvidia-cuda-nvrtc-cu12==12.4.127
56
+ nvidia-cuda-cupti-cu12==12.4.127
57
+ nvidia-cublas-cu12==12.4.5.8
58
+ numpy==2.0.2
59
+ ninja==1.11.1.4
60
+ networkx==3.2.1
61
+ nest-asyncio==1.6.0
62
+ msgpack==1.1.0
63
+ MarkupSafe==3.0.2
64
+ lxml==5.3.2
65
+ joblib==1.4.2
66
+ idna==3.10
67
+ fsspec==2024.12.0
68
+ frozenlist==1.5.0
69
+ filelock==3.18.0
70
+ executing==2.2.0
71
+ exceptiongroup==1.2.2
72
+ eval_type_backport==0.2.2
73
+ einops==0.8.1
74
+ dill==0.3.8
75
+ decorator==5.2.1
76
+ debugpy==1.8.13
77
+ colorama==0.4.6
78
+ click==8.1.8
79
+ charset-normalizer==3.4.1
80
+ chardet==5.2.0
81
+ certifi==2025.1.31
82
+ attrs==25.3.0
83
+ async-timeout==5.0.1
84
+ asttokens==3.0.0
85
+ annotated-types==0.7.0
86
+ aiohappyeyeballs==2.6.1
87
+ absl-py==2.2.2
88
+ typing-inspection==0.4.0
89
+ tqdm-multiprocess==0.0.11
90
+ tensorboardX==2.6.2.2
91
+ stack-data==0.6.3
92
+ sentry-sdk==2.25.1
93
+ scipy==1.13.1
94
+ sacrebleu==2.5.1
95
+ requests==2.32.3
96
+ python-dateutil==2.9.0.post0
97
+ pydantic_core==2.33.1
98
+ omegaconf==2.3.0
99
+ nvidia-cusparse-cu12==12.3.1.170
100
+ nvidia-cudnn-cu12==9.1.0.70
101
+ numexpr==2.10.2
102
+ nltk==3.9.1
103
+ multiprocess==0.70.16
104
+ multidict==6.3.2
105
+ mbstrdecoder==1.1.4
106
+ matplotlib-inline==0.1.7
107
+ jupyter_core==5.7.2
108
+ jsonlines==4.0.0
109
+ Jinja2==3.1.6
110
+ jedi==0.19.2
111
+ importlib_resources==6.5.2
112
+ importlib_metadata==8.6.1
113
+ gitdb==4.0.12
114
+ docker-pycreds==0.4.0
115
+ comm==0.2.2
116
+ aiosignal==1.3.2
117
+ yarl==1.19.0
118
+ typepy==1.3.4
119
+ scikit-learn==1.6.1
120
+ rouge-score==0.1.2
121
+ pydantic==2.11.3
122
+ pandas==2.2.3
123
+ nvidia-cusolver-cu12==11.6.1.9
124
+ jupyter_client==8.6.3
125
+ ipython==8.18.1
126
+ huggingface-hub==0.30.2
127
+ GitPython==3.1.44
128
+ wandb==0.19.9
129
+ torch==2.6.0
130
+ tokenizers==0.15.2
131
+ ipykernel==6.29.5
132
+ aiohttp==3.11.16
133
+ transformers==4.37.0
134
+ deepspeed==0.16.5
135
+ DataProperty==1.1.0
136
+ bitsandbytes==0.45.5
137
+ accelerate==0.28.0
138
+ tabledata==1.3.4
139
+ peft==0.8.0
140
+ datasets==3.5.0
141
+ pytablewriter==1.2.1
142
+ evaluate==0.4.3
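A minimal sketch for spot-checking that an active Python environment matches the pins above; the handful of packages chosen here is illustrative, not exhaustive:

from importlib.metadata import PackageNotFoundError, version

# Spot-check a few of the pins above against whatever environment is active.
pins = {"torch": "2.6.0", "transformers": "4.37.0", "deepspeed": "0.16.5", "peft": "0.8.0", "datasets": "3.5.0"}
for name, expected in pins.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not installed"
    marker = "OK" if installed == expected else "MISMATCH"
    print(f"{name:<14} pinned {expected:<8} found {installed:<14} {marker}")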
wandb/run-20250408_144457-lfekgvx4/files/wandb-metadata.json ADDED
@@ -0,0 +1,108 @@
1
+ {
2
+ "os": "Linux-5.15.0-134-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.9.21",
4
+ "startedAt": "2025-04-08T14:44:57.017674Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "../models/TinyLlama_v1.1/",
9
+ "--data_path",
10
+ "../data/datasets/Llama-3-3B/mix_wiki_alpaca_8000.json",
11
+ "--model_max_length",
12
+ "1024",
13
+ "--output_dir",
14
+ "./ckpts/tinyllama_v1.1/int2-g128/",
15
+ "--logging_dir",
16
+ "./ckpts/tinyllama_v1.1/int2-g128/runs/",
17
+ "--num_train_epochs",
18
+ "4",
19
+ "--bf16",
20
+ "True",
21
+ "--seed",
22
+ "42",
23
+ "--per_device_train_batch_size",
24
+ "16",
25
+ "--per_device_eval_batch_size",
26
+ "16",
27
+ "--gradient_accumulation_steps",
28
+ "4",
29
+ "--gradient_checkpointing",
30
+ "True",
31
+ "--evaluation_strategy",
32
+ "steps",
33
+ "--eval_steps",
34
+ "40",
35
+ "--load_best_model_at_end",
36
+ "True",
37
+ "--save_strategy",
38
+ "steps",
39
+ "--save_steps",
40
+ "40",
41
+ "--save_total_limit",
42
+ "2",
43
+ "--learning_rate",
44
+ "2e-5",
45
+ "--lr_scheduler_type",
46
+ "constant",
47
+ "--weight_decay",
48
+ "0.",
49
+ "--logging_steps",
50
+ "1",
51
+ "--report_to",
52
+ "tensorboard",
53
+ "wandb",
54
+ "--deepspeed",
55
+ "config/zero.json",
56
+ "--bits",
57
+ "2",
58
+ "--quant_type",
59
+ "int2-asym",
60
+ "--q_group_size",
61
+ "128",
62
+ "--train_kd",
63
+ "True",
64
+ "--kd_loss_type",
65
+ "cakld",
66
+ "--max_train_samples",
67
+ "999999",
68
+ "--clip",
69
+ "../quantization/clip_cache/TinyLlama_v1.1/int2-g128.pt"
70
+ ],
71
+ "program": "/workspace/BitDistiller/train/train.py",
72
+ "codePath": "train/train.py",
73
+ "git": {
74
+ "remote": "git@github.com:BrownianNotion/BitDistiller.git",
75
+ "commit": "7b9cf8ae46eb4dd7fcdccefe54de4dd992197672"
76
+ },
77
+ "email": "andrewwusyd@gmail.com",
78
+ "root": "./ckpts/tinyllama_v1.1/int2-g128/",
79
+ "host": "8f7554e9d37a",
80
+ "executable": "/workspace/BitDistiller/BitDistillerVenv/bin/python3.9",
81
+ "codePathLocal": "train.py",
82
+ "cpu_count": 64,
83
+ "cpu_count_logical": 128,
84
+ "gpu": "NVIDIA A100-SXM4-80GB",
85
+ "gpu_count": 1,
86
+ "disk": {
87
+ "/": {
88
+ "total": "326417514496",
89
+ "used": "15332990976"
90
+ }
91
+ },
92
+ "memory": {
93
+ "total": "540598263808"
94
+ },
95
+ "cpu": {
96
+ "count": 64,
97
+ "countLogical": 128
98
+ },
99
+ "gpu_nvidia": [
100
+ {
101
+ "name": "NVIDIA A100-SXM4-80GB",
102
+ "memoryTotal": "85899345920",
103
+ "cudaCores": 6912,
104
+ "architecture": "Ampere"
105
+ }
106
+ ],
107
+ "cudaVersion": "12.8"
108
+ }
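The metadata above records the full training invocation (TinyLlama v1.1, int2-asym quantization with group size 128, CAKLD distillation loss, 4 epochs at a constant 2e-5 learning rate on a single A100). A minimal sketch for reconstructing the python-level command from the recorded program and args fields; the deepspeed launcher wrapper is not captured in this file, so the result is only an approximation:

import json
import shlex

with open("wandb/run-20250408_144457-lfekgvx4/files/wandb-metadata.json") as f:
    meta = json.load(f)

# Rebuild the recorded invocation of train/train.py from the args list.
cmd = " ".join(["python", meta["codePath"], *map(shlex.quote, meta["args"])])
print(cmd)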
wandb/run-20250408_144457-lfekgvx4/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"eval/steps_per_second":1.259,"train/global_step":400,"train/train_samples_per_second":5.405,"_wandb":{"runtime":4761},"eval/runtime":79.4274,"eval/samples_per_second":20.144,"_timestamp":1.744128254682862e+09,"train/learning_rate":2e-05,"train/train_steps_per_second":0.084,"train/total_flos":1.6260292597199667e+17,"train/epoch":4,"eval/loss":147.53726196289062,"train/loss":107.9323,"_runtime":4757.665722468,"train/train_loss":210.14494161605836,"_step":410,"train/train_runtime":4736.5342}
wandb/run-20250408_144457-lfekgvx4/logs/debug-core.log ADDED
@@ -0,0 +1,15 @@
1
+ {"time":"2025-04-08T14:44:56.482488302Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp38v1mewj/port-8387.txt","pid":8387,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-04-08T14:44:56.497266323Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":8387}
3
+ {"time":"2025-04-08T14:44:56.497276172Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":36187,"Zone":""}}
4
+ {"time":"2025-04-08T14:44:56.670419687Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:60576"}
5
+ {"time":"2025-04-08T14:44:57.020718698Z","level":"INFO","msg":"handleInformInit: received","streamId":"lfekgvx4","id":"127.0.0.1:60576"}
6
+ {"time":"2025-04-08T14:44:57.323350837Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"lfekgvx4","id":"127.0.0.1:60576"}
7
+ {"time":"2025-04-08T16:04:19.13096311Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"lfekgvx4","id":"127.0.0.1:60576"}
8
+ {"time":"2025-04-08T16:04:19.131291495Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"lfekgvx4","id":"127.0.0.1:60576"}
9
+ {"time":"2025-04-08T16:04:20.130224152Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:60576"}
10
+ {"time":"2025-04-08T16:04:20.130274767Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:60576"}
11
+ {"time":"2025-04-08T16:04:20.130288583Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2025-04-08T16:04:20.130290526Z","level":"INFO","msg":"connection: closing","id":"127.0.0.1:60576"}
13
+ {"time":"2025-04-08T16:04:20.130472868Z","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:60576"}
14
+ {"time":"2025-04-08T16:04:20.130505839Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:60576"}
15
+ {"time":"2025-04-08T16:04:20.130521489Z","level":"INFO","msg":"server is closed"}
wandb/run-20250408_144457-lfekgvx4/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
1
+ {"time":"2025-04-08T14:44:57.021153491Z","level":"INFO","msg":"stream: starting","core version":"0.19.9","symlink path":"ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_144457-lfekgvx4/logs/debug-core.log"}
2
+ {"time":"2025-04-08T14:44:57.323241702Z","level":"INFO","msg":"created new stream","id":"lfekgvx4"}
3
+ {"time":"2025-04-08T14:44:57.323338343Z","level":"INFO","msg":"stream: started","id":"lfekgvx4"}
4
+ {"time":"2025-04-08T14:44:57.323398396Z","level":"INFO","msg":"writer: Do: started","stream_id":"lfekgvx4"}
5
+ {"time":"2025-04-08T14:44:57.323467054Z","level":"INFO","msg":"handler: started","stream_id":"lfekgvx4"}
6
+ {"time":"2025-04-08T14:44:57.323506598Z","level":"INFO","msg":"sender: started","stream_id":"lfekgvx4"}
7
+ {"time":"2025-04-08T14:44:57.614102027Z","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-04-08T16:04:18.122439099Z","level":"INFO","msg":"Stopping system monitor"}
9
+ {"time":"2025-04-08T16:04:18.122764017Z","level":"INFO","msg":"Stopped system monitor"}
10
+ {"time":"2025-04-08T16:04:18.897381609Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
11
+ {"time":"2025-04-08T16:04:19.123406575Z","level":"INFO","msg":"handler: operation stats","stats":{}}
12
+ {"time":"2025-04-08T16:04:19.131020207Z","level":"INFO","msg":"stream: closing","id":"lfekgvx4"}
13
+ {"time":"2025-04-08T16:04:19.131051826Z","level":"INFO","msg":"handler: closed","stream_id":"lfekgvx4"}
14
+ {"time":"2025-04-08T16:04:19.131065982Z","level":"INFO","msg":"writer: Close: closed","stream_id":"lfekgvx4"}
15
+ {"time":"2025-04-08T16:04:19.131087513Z","level":"INFO","msg":"sender: closed","stream_id":"lfekgvx4"}
16
+ {"time":"2025-04-08T16:04:19.131275715Z","level":"INFO","msg":"stream: closed","id":"lfekgvx4"}
wandb/run-20250408_144457-lfekgvx4/logs/debug.log ADDED
@@ -0,0 +1,30 @@
1
+ 2025-04-08 14:44:57,011 INFO MainThread:8387 [wandb_setup.py:_flush():67] Current SDK version is 0.19.9
2
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_setup.py:_flush():67] Configure stats pid to 8387
3
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_setup.py:_flush():67] Loading settings from /root/.config/wandb/settings
4
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_setup.py:_flush():67] Loading settings from /workspace/BitDistiller/train/wandb/settings
5
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:setup_run_log_directory():662] Logging user logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_144457-lfekgvx4/logs/debug.log
7
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:setup_run_log_directory():663] Logging internal logs to ./ckpts/tinyllama_v1.1/int2-g128/wandb/run-20250408_144457-lfekgvx4/logs/debug-internal.log
8
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:init():781] calling init triggers
9
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:init():786] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:init():809] starting backend
12
+ 2025-04-08 14:44:57,012 INFO MainThread:8387 [wandb_init.py:init():813] sending inform_init request
13
+ 2025-04-08 14:44:57,017 INFO MainThread:8387 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-08 14:44:57,017 INFO MainThread:8387 [wandb_init.py:init():823] backend started and connected
15
+ 2025-04-08 14:44:57,020 INFO MainThread:8387 [wandb_init.py:init():915] updated telemetry
16
+ 2025-04-08 14:44:57,211 INFO MainThread:8387 [wandb_init.py:init():939] communicating run to backend with 90.0 second timeout
17
+ 2025-04-08 14:44:57,610 INFO MainThread:8387 [wandb_init.py:init():1014] starting run threads in backend
18
+ 2025-04-08 14:44:57,712 INFO MainThread:8387 [wandb_run.py:_console_start():2454] atexit reg
19
+ 2025-04-08 14:44:57,712 INFO MainThread:8387 [wandb_run.py:_redirect():2306] redirect: wrap_raw
20
+ 2025-04-08 14:44:57,712 INFO MainThread:8387 [wandb_run.py:_redirect():2371] Wrapping output streams.
21
+ 2025-04-08 14:44:57,712 INFO MainThread:8387 [wandb_run.py:_redirect():2394] Redirects installed.
22
+ 2025-04-08 14:44:57,714 INFO MainThread:8387 [wandb_init.py:init():1056] run started, returning control to user process
23
+ 2025-04-08 14:45:18,149 INFO MainThread:8387 [wandb_run.py:_config_callback():1327] config_cb None None {'vocab_size': 32001, 'max_position_embeddings': 2048, 'hidden_size': 2048, 'intermediate_size': 5632, 'num_hidden_layers': 22, 'num_attention_heads': 32, 'num_key_value_heads': 4, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '../models/TinyLlama_v1.1/', 'transformers_version': '4.37.0', 'model_type': 'llama', 'output_dir': './ckpts/tinyllama_v1.1/int2-g128/', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 4.0, 'max_steps': -1, 'lr_scheduler_type': 'constant', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './ckpts/tinyllama_v1.1/int2-g128/runs/', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 40, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 40, 'dataloader_num_workers': 0, 'past_index': -1, 
'run_name': './ckpts/tinyllama_v1.1/int2-g128/', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'deepspeed': 'config/zero.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': False, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'cache_dir': None, 'model_max_length': 1024, 'bits': 2, 'q_group_size': 128, 'quant_type': 'int2-asym', 'clip': '../quantization/clip_cache/TinyLlama_v1.1/int2-g128.pt', 'train_kd': True, 'kd_tmp': 1, 'kd_loss_type': 'cakld', 'cakld_steps': 10}
24
+ 2025-04-08 16:04:18,120 INFO MainThread:8387 [wandb_run.py:_finish():2189] finishing run DeepFriedNLP/SNLP_BitDistiller/lfekgvx4
25
+ 2025-04-08 16:04:18,121 INFO MainThread:8387 [wandb_run.py:_atexit_cleanup():2419] got exitcode: 0
26
+ 2025-04-08 16:04:18,121 INFO MainThread:8387 [wandb_run.py:_restore():2401] restore
27
+ 2025-04-08 16:04:18,121 INFO MainThread:8387 [wandb_run.py:_restore():2407] restore done
28
+ 2025-04-08 16:04:19,128 INFO MainThread:8387 [wandb_run.py:_footer_history_summary_info():4064] rendering history
29
+ 2025-04-08 16:04:19,129 INFO MainThread:8387 [wandb_run.py:_footer_history_summary_info():4096] rendering summary
30
+ 2025-04-08 16:04:19,130 INFO MainThread:8387 [wandb_run.py:_footer_sync_info():4025] logging synced files
wandb/run-20250408_144457-lfekgvx4/run-lfekgvx4.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3a8068266451df8f9cf073af1b4e65e3bebf77454b85383b56fdb31e0c35d5b
3
+ size 1228214
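The .wandb file itself is stored as a git-lfs object; the three lines above are only the pointer (spec version, sha256 oid, size in bytes). A minimal sketch for verifying a pulled copy against that pointer; the local path simply mirrors the repo layout:

import hashlib

# After `git lfs pull`, hash the local file and compare it to the oid in the pointer above.
expected = "d3a8068266451df8f9cf073af1b4e65e3bebf77454b85383b56fdb31e0c35d5b"
h = hashlib.sha256()
with open("wandb/run-20250408_144457-lfekgvx4/run-lfekgvx4.wandb", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
print("sha256 matches pointer:", h.hexdigest() == expected)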