sanchit-gandhi HF staff committed on
Commit
718d1ae
1 Parent(s): ad515d9

Training in progress, step 100

Browse files
accelerate_config.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ enable_cpu_affinity: false
6
+ gpu_ids: all
7
+ machine_rank: 0
8
+ main_training_function: main
9
+ mixed_precision: bf16
10
+ num_machines: 1
11
+ num_processes: 8
12
+ rdzv_backend: static
13
+ same_network: true
14
+ tpu_env: []
15
+ tpu_use_cluster: false
16
+ tpu_use_sudo: false
17
+ use_cpu: false
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat-200k",
3
+ "architectures": [
4
+ "MistralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 4096,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 14336,
13
+ "max_position_embeddings": 32768,
14
+ "model_type": "mistral",
15
+ "num_attention_heads": 32,
16
+ "num_hidden_layers": 6,
17
+ "num_key_value_heads": 8,
18
+ "rms_norm_eps": 1e-05,
19
+ "rope_theta": 10000.0,
20
+ "sliding_window": 4096,
21
+ "tie_word_embeddings": false,
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.40.1",
24
+ "use_cache": false,
25
+ "vocab_size": 32000
26
+ }
config_200k.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat-200k
3
+ torch_dtype: null
4
+
5
+ # Data training arguments
6
+ # For definitions, see: src/h4/training/config.py
7
+ dataset_mixer:
8
+ HuggingFaceH4/ultrafeedback_binarized: 1.0
9
+ dataset_splits:
10
+ - train_prefs
11
+ - test_prefs
12
+ preprocessing_num_workers: 12
13
+
14
+ # DPOTrainer arguments
15
+ bf16: true
16
+ beta: 0.01
17
+ do_eval: true
18
+ evaluation_strategy: steps
19
+ eval_steps: 100
20
+ gradient_accumulation_steps: 2
21
+ gradient_checkpointing: true
22
+ gradient_checkpointing_kwargs:
23
+ use_reentrant: False
24
+ learning_rate: 5.0e-7
25
+ log_level: info
26
+ logging_steps: 25
27
+ lr_scheduler_type: cosine
28
+ max_length: 1024
29
+ max_prompt_length: 512
30
+ num_train_epochs: 1
31
+ optim: adamw_torch
32
+ output_dir: ./
33
+ per_device_train_batch_size: 8
34
+ per_device_eval_batch_size: 8
35
+ push_to_hub: true
36
+ save_strategy: "steps"
37
+ save_steps: 100
38
+ save_total_limit: 1
39
+ seed: 42
40
+ warmup_ratio: 0.1
41
+ report_to:
42
+ - tensorboard
43
+ - wandb
run_dpo.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ import logging
17
+ import random
18
+ import sys
19
+
20
+ import torch
21
+ import transformers
22
+ from transformers import AutoModelForCausalLM, set_seed
23
+
24
+ from alignment import (
25
+ DataArguments,
26
+ DPOConfig,
27
+ H4ArgumentParser,
28
+ ModelArguments,
29
+ apply_chat_template,
30
+ get_checkpoint,
31
+ get_datasets,
32
+ get_kbit_device_map,
33
+ get_peft_config,
34
+ get_quantization_config,
35
+ get_tokenizer,
36
+ is_adapter_model,
37
+ )
38
+ from peft import PeftConfig, PeftModel
39
+ from trl import DPOTrainer
40
+
41
+
42
logger = logging.getLogger(__name__)


def main():
    """Run DPO training end to end from the parsed configuration.

    Steps, in order:
      1. Parse model, data and training arguments with ``H4ArgumentParser``.
      2. Configure logging and seed the RNGs.
      3. Load the datasets, apply the chat template with ``task="dpo"`` and
         rename the resulting columns to ``prompt`` / ``chosen`` / ``rejected``.
      4. Resolve the policy and reference models; when the checkpoint is a
         PEFT adapter, load its base model and merge the adapter into it.
      5. Train with ``DPOTrainer`` (resuming from a checkpoint if one is set
         or detected), optionally evaluate, save the model, write the model
         card on the main process and optionally push to the Hub.

    Takes no arguments and returns ``None``; all configuration comes from the
    argument parser.
    """
    parser = H4ArgumentParser((ModelArguments, DataArguments, DPOConfig))
    model_args, data_args, training_args = parser.parse()

    #######
    # Setup
    #######
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.info(f"Model parameters {model_args}")
    logger.info(f"Data parameters {data_args}")
    logger.info(f"Training/evaluation parameters {training_args}")

    # Check for last checkpoint
    last_checkpoint = get_checkpoint(training_args)
    if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
        logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.")

    # Set seed for reproducibility
    set_seed(training_args.seed)

    ###############
    # Load datasets
    ###############
    raw_datasets = get_datasets(data_args, splits=data_args.dataset_splits)
    logger.info(
        f"Training on the following splits: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
    )
    # Remember the raw columns so they can be dropped once the templated text
    # columns have been produced.
    column_names = list(raw_datasets["train"].features)

    #####################################
    # Load tokenizer and process datasets
    #####################################
    data_args.truncation_side = "left"  # Truncate from left to ensure we don't lose labels in final turn
    tokenizer = get_tokenizer(model_args, data_args)

    #####################
    # Apply chat template
    #####################
    raw_datasets = raw_datasets.map(
        apply_chat_template,
        fn_kwargs={"tokenizer": tokenizer, "task": "dpo"},
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        desc="Formatting comparisons with prompt template",
    )

    # Replace column names with what TRL needs:
    # text_prompt -> prompt, text_chosen -> chosen and text_rejected -> rejected
    for split in ["train", "test"]:
        raw_datasets[split] = raw_datasets[split].rename_columns(
            {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
        )

    # Log a few random samples from the training set. The sample count is
    # capped at the dataset size because random.sample raises ValueError when
    # asked for more items than the population holds.
    num_train_rows = len(raw_datasets["train"])
    for index in random.sample(range(num_train_rows), min(3, num_train_rows)):
        logger.info(f"Prompt sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['prompt']}")
        logger.info(f"Chosen sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['chosen']}")
        logger.info(f"Rejected sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['rejected']}")

    # "auto" and None are passed through untouched; any other value is looked
    # up as an attribute of the torch module (e.g. "bfloat16" -> torch.bfloat16).
    torch_dtype = (
        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
    )
    quantization_config = get_quantization_config(model_args)

    # NOTE(review): `use_flash_attention_2` is reported as deprecated in recent
    # transformers releases in favour of `attn_implementation` - confirm against
    # the pinned transformers version before changing.
    model_kwargs = dict(
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
        use_flash_attention_2=model_args.use_flash_attention_2,
        torch_dtype=torch_dtype,
        # The k,v cache is switched off whenever gradient checkpointing is on.
        use_cache=not training_args.gradient_checkpointing,
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
    )

    # Start from the model identifier; it is only replaced by an instantiated
    # model in the adapter branch below.
    model = model_args.model_name_or_path
    if is_adapter_model(model, model_args.model_revision) is True:
        # Load the base model, merge the adapter weights and unload the adapter
        # Note: to run QLoRA, you will need to merge the base model separately as the merged model in 16bit
        logger.info(f"Merging PEFT adapters for {model_args.model_name_or_path=}")

        peft_config = PeftConfig.from_pretrained(model_args.model_name_or_path, revision=model_args.model_revision)

        # The base model is loaded without device_map / quantization_config.
        model_kwargs = dict(
            revision=model_args.base_model_revision,
            trust_remote_code=model_args.trust_remote_code,
            use_flash_attention_2=model_args.use_flash_attention_2,
            torch_dtype=torch_dtype,
            use_cache=not training_args.gradient_checkpointing,
        )
        base_model = AutoModelForCausalLM.from_pretrained(
            peft_config.base_model_name_or_path,
            **model_kwargs,
        )
        model = PeftModel.from_pretrained(
            base_model, model_args.model_name_or_path, revision=model_args.model_revision
        )
        model.eval()
        model = model.merge_and_unload()
        # The model is already instantiated, so no init kwargs are forwarded.
        model_kwargs = None

    # NOTE(review): in the adapter branch above `model` is an instantiated
    # module, so `ref_model` refers to the very same object as the policy model
    # - confirm this is what DPOTrainer expects.
    ref_model = model
    ref_model_kwargs = model_kwargs

    if model_args.use_peft is True:
        ref_model = None
        ref_model_kwargs = None

    #########################
    # Instantiate DPO trainer
    #########################
    trainer = DPOTrainer(
        model,
        ref_model,
        model_init_kwargs=model_kwargs,
        ref_model_init_kwargs=ref_model_kwargs,
        args=training_args,
        beta=training_args.beta,
        train_dataset=raw_datasets["train"],
        eval_dataset=raw_datasets["test"],
        tokenizer=tokenizer,
        max_length=training_args.max_length,
        max_prompt_length=training_args.max_prompt_length,
        peft_config=get_peft_config(model_args),
        loss_type=training_args.loss_type,
    )

    ###############
    # Training loop
    ###############
    # An explicitly requested checkpoint takes precedence over a detected one.
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    metrics["train_samples"] = len(raw_datasets["train"])
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    logger.info("*** Training complete ***")

    ##########
    # Evaluate
    ##########
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        metrics["eval_samples"] = len(raw_datasets["test"])
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    ##################################
    # Save model and create model card
    ##################################
    logger.info("*** Save model ***")
    trainer.save_model(training_args.output_dir)
    logger.info(f"Model saved to {training_args.output_dir}")

    # Save everything else on main process
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "dataset": list(data_args.dataset_mixer.keys()),
        "dataset_tags": list(data_args.dataset_mixer.keys()),
        "tags": ["alignment-handbook"],
    }
    if trainer.accelerator.is_main_process:
        trainer.create_model_card(**kwargs)
        # Restore k,v cache for fast inference
        trainer.model.config.use_cache = True
        trainer.model.config.save_pretrained(training_args.output_dir)

    if training_args.push_to_hub is True:
        logger.info("Pushing to hub...")
        trainer.push_to_hub(**kwargs)

    logger.info("*** Training complete! ***")


if __name__ == "__main__":
    main()
237
+
runs/Apr26_16-38-17_ip-26-0-161-178/events.out.tfevents.1714149975.ip-26-0-161-178.306341.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1965c5ca22ae4f3f8b62fc21e894bc3ef67848a9d40a53379b8db44b2fdb9d04
3
+ size 8887
slurm_job.slurm ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=distil-zephyr
#SBATCH --nodes=1
# set 4h for job wall time limit
#SBATCH --time=4:00:00
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:8
#SBATCH --exclusive
#SBATCH --partition=hopper-prod
#SBATCH --output=/fsx/sanchit/alignment-logs/%x-%j.out

# Single-node launcher: activates the conda env, then runs run_dpo.py through
# `accelerate launch` inside an srun step, appending all output to a shared log.

set -x -e

# START EDIT
source ~/.bashrc
source /fsx/sanchit/miniconda3/bin/activate alignment

# NOTE(review): LOG_PATH is not referenced anywhere below; the srun output is
# appended to $SAVE_DIR/logs/main_log.txt instead - confirm which is intended.
LOG_PATH="/fsx/sanchit/alignment-logs/main_log.txt"
SAVE_DIR="/fsx/sanchit"
# END EDIT

echo "START TIME: $(date)"

GPUS_PER_NODE=8
NNODES=$SLURM_NNODES

# so processes know who to talk to
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)

# From https://i.hsfzxjy.site/2021-03-10-obtain-a-random-unused-tcp-port-with-bash/
# Print N (default 1) random TCP ports in 1025-65535 that do not appear as a
# local port in the `ss -Htan` listing.
function unused_port() {
    local N=${1:-1}
    comm -23 \
        <(seq "1025" "65535" | sort) \
        <(ss -Htan |
            # Take the text after the LAST colon of the local address so that
            # IPv6 entries such as "[::]:22" yield the port, not an empty field.
            awk '{ n = split($4, parts, ":"); print parts[n] }' |
            sort -u) |
        shuf |
        head -n "$N"
}
MASTER_PORT=$(unused_port)

# export TORCH_CPP_LOG_LEVEL=INFO
# export TORCH_DISTRIBUTED_DEBUG=DETAIL

export LAUNCHER="python -u -m accelerate.commands.launch --config_file ./accelerate_config.yaml"

export PROGRAM="./run_dpo.py ./config_200k.yaml"
export CMD="$LAUNCHER $PROGRAM"
echo "$CMD"

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=(
    --wait=60
    --kill-on-bad-exit=1
)

# tee -a fails if the target directory is missing, so create it up front.
mkdir -p "$SAVE_DIR/logs"

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
# `clear` may fail when no terminal is attached; do not let `set -e` abort the job for it.
clear || true
srun "${SRUN_ARGS[@]}" --jobid "$SLURM_JOB_ID" bash -c "$CMD" 2>&1 | tee -a "$SAVE_DIR/logs/main_log.txt"

echo "END TIME: $(date)"
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a158eefb40afd96d29eabbb6f62d58ed873e0be663e31aa892ec807013cc3c2a
3
+ size 5112
wandb/debug-internal.log ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-04-26 16:46:17,106 INFO StreamThr :307541 [internal.py:wandb_internal():86] W&B internal server running at pid: 307541, started at: 2024-04-26 16:46:17.104548
2
+ 2024-04-26 16:46:17,107 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status
3
+ 2024-04-26 16:46:17,112 INFO WriterThread:307541 [datastore.py:open_for_write():87] open: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/run-71zld9et.wandb
4
+ 2024-04-26 16:46:17,113 DEBUG SenderThread:307541 [sender.py:send():379] send: header
5
+ 2024-04-26 16:46:17,157 DEBUG SenderThread:307541 [sender.py:send():379] send: run
6
+ 2024-04-26 16:46:17,358 INFO SenderThread:307541 [dir_watcher.py:__init__():211] watching files in: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files
7
+ 2024-04-26 16:46:17,358 INFO SenderThread:307541 [sender.py:_start_run_threads():1124] run started: 71zld9et with start time 1714149977.108705
8
+ 2024-04-26 16:46:17,364 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-04-26 16:46:17,364 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: check_version
10
+ 2024-04-26 16:46:17,420 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-04-26 16:46:17,480 DEBUG HandlerThread:307541 [system_info.py:__init__():26] System info init
12
+ 2024-04-26 16:46:17,480 DEBUG HandlerThread:307541 [system_info.py:__init__():41] System info init done
13
+ 2024-04-26 16:46:17,481 INFO HandlerThread:307541 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-04-26 16:46:17,481 INFO SystemMonitor:307541 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-04-26 16:46:17,481 INFO HandlerThread:307541 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-04-26 16:46:17,481 INFO SystemMonitor:307541 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-04-26 16:46:17,481 INFO SystemMonitor:307541 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-04-26 16:46:17,482 INFO SystemMonitor:307541 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-04-26 16:46:17,482 INFO SystemMonitor:307541 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-04-26 16:46:17,483 INFO SystemMonitor:307541 [interfaces.py:start():190] Started network monitoring
21
+ 2024-04-26 16:46:17,529 DEBUG HandlerThread:307541 [system_info.py:probe():150] Probing system
22
+ 2024-04-26 16:46:17,531 DEBUG HandlerThread:307541 [system_info.py:_probe_git():135] Probing git
23
+ 2024-04-26 16:46:17,552 DEBUG HandlerThread:307541 [system_info.py:_probe_git():143] Probing git done
24
+ 2024-04-26 16:46:17,552 DEBUG HandlerThread:307541 [system_info.py:probe():198] Probing system done
25
+ 2024-04-26 16:46:17,552 DEBUG HandlerThread:307541 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-1048-aws-x86_64-with-glibc2.31', 'python': '3.11.9', 'heartbeatAt': '2024-04-26T16:46:17.529449', 'startedAt': '2024-04-26T16:46:17.086330', 'docker': None, 'cuda': None, 'args': ('./config_200k.yaml',), 'state': 'running', 'program': '/fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/./run_dpo.py', 'codePathLocal': 'run_dpo.py', 'codePath': 'run_dpo.py', 'git': {'remote': 'https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-dpo-ultrafeedback-200k', 'commit': 'ad515d951da0956ace4aee8562ec51c4a6e5f486'}, 'email': None, 'root': '/fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k', 'host': 'ip-26-0-161-178', 'username': 'sanchit', 'executable': '/fsx/sanchit/miniconda3/envs/alignment/bin/python', 'cpu_count': 96, 'cpu_count_logical': 96, 'cpu_freq': {'current': 2727.4798437500003, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3538.047, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, 
{'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3538.387, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.515, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.363, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3593.838, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 
2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.138, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3574.723, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.054, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 290.7472343444824, 'used': 62.08610534667969}}, 'gpu': 'NVIDIA H100 80GB HBM3', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}], 'memory': {'total': 1999.9855346679688}}
26
+ 2024-04-26 16:46:17,552 INFO HandlerThread:307541 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-04-26 16:46:17,552 INFO HandlerThread:307541 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-04-26 16:46:17,552 DEBUG HandlerThread:307541 [system_info.py:_save_conda():207] Saving list of conda packages installed into the current environment
29
+ 2024-04-26 16:46:18,360 INFO Thread-12 :307541 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/conda-environment.yaml
30
+ 2024-04-26 16:46:21,876 DEBUG HandlerThread:307541 [system_info.py:_save_conda():222] Saving conda packages done
31
+ 2024-04-26 16:46:21,878 INFO HandlerThread:307541 [system_monitor.py:probe():229] Finished publishing system info
32
+ 2024-04-26 16:46:21,902 DEBUG SenderThread:307541 [sender.py:send():379] send: files
33
+ 2024-04-26 16:46:21,902 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-metadata.json with policy now
34
+ 2024-04-26 16:46:22,045 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: python_packages
35
+ 2024-04-26 16:46:22,045 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: python_packages
36
+ 2024-04-26 16:46:22,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
37
+ 2024-04-26 16:46:22,047 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
38
+ 2024-04-26 16:46:22,048 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
39
+ 2024-04-26 16:46:22,158 INFO wandb-upload_0:307541 [upload_job.py:push():131] Uploaded file /tmp/tmpyc0gjhuhwandb/b5y5043z-wandb-metadata.json
40
+ 2024-04-26 16:46:22,192 DEBUG SenderThread:307541 [sender.py:send():379] send: telemetry
41
+ 2024-04-26 16:46:22,192 DEBUG SenderThread:307541 [sender.py:send():379] send: config
42
+ 2024-04-26 16:46:22,192 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-04-26 16:46:22,193 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
44
+ 2024-04-26 16:46:22,193 DEBUG SenderThread:307541 [sender.py:send():379] send: telemetry
45
+ 2024-04-26 16:46:22,193 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
46
+ 2024-04-26 16:46:22,193 WARNING SenderThread:307541 [sender.py:send_metric():1341] Seen metric with glob (shouldn't happen)
47
+ 2024-04-26 16:46:22,193 DEBUG SenderThread:307541 [sender.py:send():379] send: telemetry
48
+ 2024-04-26 16:46:22,363 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/conda-environment.yaml
49
+ 2024-04-26 16:46:22,364 INFO Thread-12 :307541 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/requirements.txt
50
+ 2024-04-26 16:46:22,364 INFO Thread-12 :307541 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
51
+ 2024-04-26 16:46:22,364 INFO Thread-12 :307541 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-metadata.json
52
+ 2024-04-26 16:46:24,157 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
53
+ 2024-04-26 16:46:24,160 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
54
+ 2024-04-26 16:46:24,160 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
55
+ 2024-04-26 16:46:24,160 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
56
+ 2024-04-26 16:46:24,160 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
57
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
58
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
59
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
60
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
61
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
62
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
63
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
64
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
65
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: history
66
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
67
+ 2024-04-26 16:46:24,164 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
68
+ 2024-04-26 16:46:24,366 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
69
+ 2024-04-26 16:46:24,366 INFO Thread-12 :307541 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
70
+ 2024-04-26 16:46:26,368 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
71
+ 2024-04-26 16:46:27,676 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
72
+ 2024-04-26 16:46:28,370 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
73
+ 2024-04-26 16:46:30,371 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
74
+ 2024-04-26 16:46:32,373 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
75
+ 2024-04-26 16:46:32,742 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
76
+ 2024-04-26 16:46:34,375 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
77
+ 2024-04-26 16:46:36,377 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
78
+ 2024-04-26 16:46:37,045 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
79
+ 2024-04-26 16:46:37,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
80
+ 2024-04-26 16:46:37,047 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
81
+ 2024-04-26 16:46:38,098 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
82
+ 2024-04-26 16:46:38,379 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
83
+ 2024-04-26 16:46:40,381 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
84
+ 2024-04-26 16:46:42,382 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
85
+ 2024-04-26 16:46:43,102 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
86
+ 2024-04-26 16:46:44,384 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
87
+ 2024-04-26 16:46:46,386 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
88
+ 2024-04-26 16:46:48,221 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-04-26 16:46:48,388 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/config.yaml
90
+ 2024-04-26 16:46:48,389 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
91
+ 2024-04-26 16:46:50,390 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
92
+ 2024-04-26 16:46:52,045 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
93
+ 2024-04-26 16:46:52,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
94
+ 2024-04-26 16:46:52,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
95
+ 2024-04-26 16:46:52,392 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
96
+ 2024-04-26 16:46:54,305 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
97
+ 2024-04-26 16:46:54,394 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
98
+ 2024-04-26 16:46:54,619 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
99
+ 2024-04-26 16:46:54,621 DEBUG SenderThread:307541 [sender.py:send():379] send: history
100
+ 2024-04-26 16:46:54,621 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
101
+ 2024-04-26 16:46:54,623 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
102
+ 2024-04-26 16:46:55,395 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
103
+ 2024-04-26 16:46:56,396 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
104
+ 2024-04-26 16:46:58,398 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
105
+ 2024-04-26 16:46:59,425 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
106
+ 2024-04-26 16:47:00,400 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
107
+ 2024-04-26 16:47:02,402 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
108
+ 2024-04-26 16:47:04,404 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
109
+ 2024-04-26 16:47:04,502 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
110
+ 2024-04-26 16:47:06,405 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
111
+ 2024-04-26 16:47:07,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
112
+ 2024-04-26 16:47:07,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
113
+ 2024-04-26 16:47:07,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
114
+ 2024-04-26 16:47:08,407 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
115
+ 2024-04-26 16:47:09,566 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
116
+ 2024-04-26 16:47:10,409 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
117
+ 2024-04-26 16:47:12,411 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
118
+ 2024-04-26 16:47:14,413 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
119
+ 2024-04-26 16:47:14,686 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
120
+ 2024-04-26 16:47:16,415 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
121
+ 2024-04-26 16:47:17,483 DEBUG SystemMonitor:307541 [system_monitor.py:_start():172] Starting system metrics aggregation loop
122
+ 2024-04-26 16:47:17,486 DEBUG SenderThread:307541 [sender.py:send():379] send: stats
123
+ 2024-04-26 16:47:18,416 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
124
+ 2024-04-26 16:47:19,787 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
125
+ 2024-04-26 16:47:20,419 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
126
+ 2024-04-26 16:47:22,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
127
+ 2024-04-26 16:47:22,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
128
+ 2024-04-26 16:47:22,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
129
+ 2024-04-26 16:47:22,421 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
130
+ 2024-04-26 16:47:24,423 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
131
+ 2024-04-26 16:47:24,834 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
132
+ 2024-04-26 16:47:26,424 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
133
+ 2024-04-26 16:47:26,474 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
134
+ 2024-04-26 16:47:26,476 DEBUG SenderThread:307541 [sender.py:send():379] send: history
135
+ 2024-04-26 16:47:26,477 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
136
+ 2024-04-26 16:47:26,478 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
137
+ 2024-04-26 16:47:27,426 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
138
+ 2024-04-26 16:47:28,427 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
139
+ 2024-04-26 16:47:29,999 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
140
+ 2024-04-26 16:47:30,430 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
141
+ 2024-04-26 16:47:32,432 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
142
+ 2024-04-26 16:47:34,433 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
143
+ 2024-04-26 16:47:35,142 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
144
+ 2024-04-26 16:47:36,435 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
145
+ 2024-04-26 16:47:37,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
146
+ 2024-04-26 16:47:37,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
147
+ 2024-04-26 16:47:37,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
148
+ 2024-04-26 16:47:38,437 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
149
+ 2024-04-26 16:47:40,216 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
150
+ 2024-04-26 16:47:40,439 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
151
+ 2024-04-26 16:47:42,441 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
152
+ 2024-04-26 16:47:44,442 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
153
+ 2024-04-26 16:47:45,370 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
154
+ 2024-04-26 16:47:46,444 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
155
+ 2024-04-26 16:47:47,488 DEBUG SenderThread:307541 [sender.py:send():379] send: stats
156
+ 2024-04-26 16:47:48,446 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
157
+ 2024-04-26 16:47:50,431 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
158
+ 2024-04-26 16:47:50,448 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
159
+ 2024-04-26 16:47:52,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
160
+ 2024-04-26 16:47:52,047 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
161
+ 2024-04-26 16:47:52,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
162
+ 2024-04-26 16:47:52,450 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
163
+ 2024-04-26 16:47:54,451 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
164
+ 2024-04-26 16:47:55,581 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
165
+ 2024-04-26 16:47:56,453 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
166
+ 2024-04-26 16:47:58,455 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
167
+ 2024-04-26 16:47:58,481 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
168
+ 2024-04-26 16:47:58,483 DEBUG SenderThread:307541 [sender.py:send():379] send: history
169
+ 2024-04-26 16:47:58,483 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
170
+ 2024-04-26 16:47:58,485 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
171
+ 2024-04-26 16:47:59,456 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
172
+ 2024-04-26 16:48:00,457 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
173
+ 2024-04-26 16:48:00,713 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
174
+ 2024-04-26 16:48:02,459 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
175
+ 2024-04-26 16:48:04,461 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
176
+ 2024-04-26 16:48:05,871 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
177
+ 2024-04-26 16:48:06,464 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
178
+ 2024-04-26 16:48:07,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
179
+ 2024-04-26 16:48:07,047 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
180
+ 2024-04-26 16:48:07,049 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
181
+ 2024-04-26 16:48:08,466 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
182
+ 2024-04-26 16:48:10,470 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
183
+ 2024-04-26 16:48:10,999 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
184
+ 2024-04-26 16:48:12,475 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
185
+ 2024-04-26 16:48:14,479 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
186
+ 2024-04-26 16:48:16,288 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
187
+ 2024-04-26 16:48:16,484 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
188
+ 2024-04-26 16:48:17,490 DEBUG SenderThread:307541 [sender.py:send():379] send: stats
189
+ 2024-04-26 16:48:18,489 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
190
+ 2024-04-26 16:48:20,494 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
191
+ 2024-04-26 16:48:21,395 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
192
+ 2024-04-26 16:48:22,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
193
+ 2024-04-26 16:48:22,047 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
194
+ 2024-04-26 16:48:22,047 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
195
+ 2024-04-26 16:48:22,500 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
196
+ 2024-04-26 16:48:24,504 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
197
+ 2024-04-26 16:48:26,477 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
198
+ 2024-04-26 16:48:26,506 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
199
+ 2024-04-26 16:48:28,508 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
200
+ 2024-04-26 16:48:30,366 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
201
+ 2024-04-26 16:48:30,368 DEBUG SenderThread:307541 [sender.py:send():379] send: history
202
+ 2024-04-26 16:48:30,369 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
203
+ 2024-04-26 16:48:30,370 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
204
+ 2024-04-26 16:48:30,510 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
205
+ 2024-04-26 16:48:30,511 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
206
+ 2024-04-26 16:48:31,653 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
207
+ 2024-04-26 16:48:32,512 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
208
+ 2024-04-26 16:48:34,514 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
209
+ 2024-04-26 16:48:36,516 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
210
+ 2024-04-26 16:48:36,761 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
211
+ 2024-04-26 16:48:37,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
212
+ 2024-04-26 16:48:37,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
213
+ 2024-04-26 16:48:37,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
214
+ 2024-04-26 16:48:38,517 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
215
+ 2024-04-26 16:48:38,642 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
216
+ 2024-04-26 16:48:38,645 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
217
+ 2024-04-26 16:48:38,646 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
218
+ 2024-04-26 16:48:38,647 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
219
+ 2024-04-26 16:48:38,647 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
220
+ 2024-04-26 16:48:38,647 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
221
+ 2024-04-26 16:48:38,647 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
222
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
223
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
224
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
225
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
226
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
227
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
228
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: history
229
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
230
+ 2024-04-26 16:48:38,650 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
231
+ 2024-04-26 16:48:39,519 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
232
+ 2024-04-26 16:48:40,520 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
233
+ 2024-04-26 16:48:42,656 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
234
+ 2024-04-26 16:48:47,492 DEBUG SenderThread:307541 [sender.py:send():379] send: stats
235
+ 2024-04-26 16:48:48,493 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
236
+ 2024-04-26 16:48:48,528 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
237
+ 2024-04-26 16:48:52,631 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
238
+ 2024-04-26 16:48:53,418 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
239
+ 2024-04-26 16:48:53,418 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
240
+ 2024-04-26 16:48:53,567 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
241
+ 2024-04-26 16:48:54,535 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/config.yaml
242
+ 2024-04-26 16:48:58,655 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
243
+ 2024-04-26 16:49:03,655 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
wandb/debug.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-04-26 16:46:17,096 INFO MainThread:306341 [wandb_setup.py:_flush():76] Current SDK version is 0.16.6
2
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Configure stats pid to 306341
3
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
4
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/settings
5
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_dpo.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/run_dpo.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/./run_dpo.py'}
8
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Applying login settings: {}
9
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:_log_setup():521] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/logs/debug.log
10
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:_log_setup():522] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/logs/debug-internal.log
11
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:init():561] calling init triggers
12
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:init():568] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:init():611] starting backend
15
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:init():615] setting up manager
16
+ 2024-04-26 16:46:17,102 INFO MainThread:306341 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-04-26 16:46:17,108 INFO MainThread:306341 [wandb_init.py:init():623] backend started and connected
18
+ 2024-04-26 16:46:17,111 INFO MainThread:306341 [wandb_init.py:init():715] updated telemetry
19
+ 2024-04-26 16:46:17,156 INFO MainThread:306341 [wandb_init.py:init():748] communicating run to backend with 90.0 second timeout
20
+ 2024-04-26 16:46:17,363 INFO MainThread:306341 [wandb_run.py:_on_init():2357] communicating current version
21
+ 2024-04-26 16:46:17,413 INFO MainThread:306341 [wandb_run.py:_on_init():2366] got version response
22
+ 2024-04-26 16:46:17,414 INFO MainThread:306341 [wandb_init.py:init():799] starting run threads in backend
23
+ 2024-04-26 16:46:22,046 INFO MainThread:306341 [wandb_run.py:_console_start():2335] atexit reg
24
+ 2024-04-26 16:46:22,046 INFO MainThread:306341 [wandb_run.py:_redirect():2190] redirect: wrap_raw
25
+ 2024-04-26 16:46:22,046 INFO MainThread:306341 [wandb_run.py:_redirect():2255] Wrapping output streams.
26
+ 2024-04-26 16:46:22,046 INFO MainThread:306341 [wandb_run.py:_redirect():2280] Redirects installed.
27
+ 2024-04-26 16:46:22,047 INFO MainThread:306341 [wandb_init.py:init():842] run started, returning control to user process
28
+ 2024-04-26 16:46:22,048 INFO MainThread:306341 [wandb_run.py:_config_callback():1347] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 1024, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat-200k', 'transformers_version': '4.40.1', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 5e-07, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr26_16-38-17_ip-26-0-161-178', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'beta': 0.01, 'hub_model_revision': 'main', 'max_prompt_length': 512, 'loss_type': 'sigmoid'}
wandb/run-20240426_164617-71zld9et/files/conda-environment.yaml ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: alignment
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - bzip2=1.0.8=h5eee18b_5
8
+ - ca-certificates=2024.3.11=h06a4308_0
9
+ - ld_impl_linux-64=2.38=h1181459_1
10
+ - libffi=3.4.4=h6a678d5_0
11
+ - libgcc-ng=11.2.0=h1234567_1
12
+ - libgomp=11.2.0=h1234567_1
13
+ - libstdcxx-ng=11.2.0=h1234567_1
14
+ - libuuid=1.41.5=h5eee18b_0
15
+ - ncurses=6.4=h6a678d5_0
16
+ - openssl=3.0.13=h7f8727e_0
17
+ - pip=23.3.1=py311h06a4308_0
18
+ - python=3.11.9=h955ad1f_0
19
+ - readline=8.2=h5eee18b_0
20
+ - setuptools=68.2.2=py311h06a4308_0
21
+ - sqlite=3.41.2=h5eee18b_0
22
+ - tk=8.6.12=h1ccaba5_0
23
+ - wheel=0.41.2=py311h06a4308_0
24
+ - xz=5.4.6=h5eee18b_0
25
+ - zlib=1.2.13=h5eee18b_0
26
+ - pip:
27
+ - absl-py==2.1.0
28
+ - accelerate==0.29.3
29
+ - aiohttp==3.9.5
30
+ - aiosignal==1.3.1
31
+ - annotated-types==0.6.0
32
+ - appdirs==1.4.4
33
+ - attrs==23.2.0
34
+ - bitsandbytes==0.43.1
35
+ - certifi==2024.2.2
36
+ - charset-normalizer==3.3.2
37
+ - click==8.1.7
38
+ - datasets==2.19.0
39
+ - deepspeed==0.14.2
40
+ - dill==0.3.8
41
+ - docker-pycreds==0.4.0
42
+ - docstring-parser==0.16
43
+ - einops==0.7.0
44
+ - evaluate==0.4.1
45
+ - filelock==3.13.4
46
+ - frozenlist==1.4.1
47
+ - fsspec==2024.3.1
48
+ - gitdb==4.0.11
49
+ - gitpython==3.1.43
50
+ - grpcio==1.62.2
51
+ - hf-transfer==0.1.6
52
+ - hjson==3.1.0
53
+ - huggingface-hub==0.22.2
54
+ - idna==3.7
55
+ - jinja2==3.1.3
56
+ - markdown==3.6
57
+ - markdown-it-py==3.0.0
58
+ - markupsafe==2.1.5
59
+ - mdurl==0.1.2
60
+ - mpmath==1.3.0
61
+ - multidict==6.0.5
62
+ - multiprocess==0.70.16
63
+ - networkx==3.3
64
+ - ninja==1.11.1.1
65
+ - numpy==1.26.4
66
+ - nvidia-cublas-cu12==12.1.3.1
67
+ - nvidia-cuda-cupti-cu12==12.1.105
68
+ - nvidia-cuda-nvrtc-cu12==12.1.105
69
+ - nvidia-cuda-runtime-cu12==12.1.105
70
+ - nvidia-cudnn-cu12==8.9.2.26
71
+ - nvidia-cufft-cu12==11.0.2.54
72
+ - nvidia-curand-cu12==10.3.2.106
73
+ - nvidia-cusolver-cu12==11.4.5.107
74
+ - nvidia-cusparse-cu12==12.1.0.106
75
+ - nvidia-nccl-cu12==2.19.3
76
+ - nvidia-nvjitlink-cu12==12.4.127
77
+ - nvidia-nvtx-cu12==12.1.105
78
+ - packaging==24.0
79
+ - pandas==2.2.2
80
+ - peft==0.10.0
81
+ - pillow==10.3.0
82
+ - protobuf==3.20.2
83
+ - psutil==5.9.8
84
+ - py-cpuinfo==9.0.0
85
+ - pyarrow==16.0.0
86
+ - pyarrow-hotfix==0.6
87
+ - pydantic==2.7.1
88
+ - pydantic-core==2.18.2
89
+ - pygments==2.17.2
90
+ - pynvml==11.5.0
91
+ - python-dateutil==2.9.0.post0
92
+ - pytz==2024.1
93
+ - pyyaml==6.0.1
94
+ - regex==2024.4.16
95
+ - requests==2.31.0
96
+ - responses==0.18.0
97
+ - rich==13.7.1
98
+ - safetensors==0.4.3
99
+ - scipy==1.13.0
100
+ - sentencepiece==0.2.0
101
+ - sentry-sdk==2.0.0
102
+ - setproctitle==1.3.3
103
+ - shtab==1.7.1
104
+ - six==1.16.0
105
+ - smmap==5.0.1
106
+ - sympy==1.12
107
+ - tensorboard==2.16.2
108
+ - tensorboard-data-server==0.7.2
109
+ - tokenizers==0.19.1
110
+ - torch==2.2.2
111
+ - torchaudio==2.2.2
112
+ - torchvision==0.17.2
113
+ - tqdm==4.66.2
114
+ - transformers==4.40.1
115
+ - triton==2.2.0
116
+ - trl==0.8.6
117
+ - typing-extensions==4.11.0
118
+ - tyro==0.8.3
119
+ - tzdata==2024.1
120
+ - urllib3==2.2.1
121
+ - wandb==0.16.6
122
+ - werkzeug==3.0.2
123
+ - xxhash==3.4.1
124
+ - yarl==1.9.4
125
+ prefix: /fsx/sanchit/miniconda3/envs/alignment
wandb/run-20240426_164617-71zld9et/files/config.yaml ADDED
@@ -0,0 +1,747 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ python_version: 3.11.9
7
+ cli_version: 0.16.6
8
+ framework: huggingface
9
+ huggingface_version: 4.40.1
10
+ is_jupyter_run: false
11
+ is_kaggle_kernel: false
12
+ start_time: 1714149977.0
13
+ t:
14
+ 1:
15
+ - 1
16
+ - 11
17
+ - 49
18
+ - 51
19
+ - 55
20
+ - 71
21
+ - 84
22
+ - 98
23
+ 2:
24
+ - 1
25
+ - 11
26
+ - 49
27
+ - 51
28
+ - 55
29
+ - 71
30
+ - 84
31
+ - 98
32
+ 3:
33
+ - 7
34
+ - 23
35
+ 4: 3.11.9
36
+ 5: 0.16.6
37
+ 6: 4.40.1
38
+ 8:
39
+ - 5
40
+ 9:
41
+ 1: transformers_trainer
42
+ 13: linux-x86_64
43
+ m:
44
+ - 1: train/global_step
45
+ 6:
46
+ - 3
47
+ - 1: train/loss
48
+ 5: 1
49
+ 6:
50
+ - 1
51
+ - 1: train/grad_norm
52
+ 5: 1
53
+ 6:
54
+ - 1
55
+ - 1: train/learning_rate
56
+ 5: 1
57
+ 6:
58
+ - 1
59
+ - 1: train/rewards/chosen
60
+ 5: 1
61
+ 6:
62
+ - 1
63
+ - 1: train/rewards/rejected
64
+ 5: 1
65
+ 6:
66
+ - 1
67
+ - 1: train/rewards/accuracies
68
+ 5: 1
69
+ 6:
70
+ - 1
71
+ - 1: train/rewards/margins
72
+ 5: 1
73
+ 6:
74
+ - 1
75
+ - 1: train/logps/rejected
76
+ 5: 1
77
+ 6:
78
+ - 1
79
+ - 1: train/logps/chosen
80
+ 5: 1
81
+ 6:
82
+ - 1
83
+ - 1: train/logits/rejected
84
+ 5: 1
85
+ 6:
86
+ - 1
87
+ - 1: train/logits/chosen
88
+ 5: 1
89
+ 6:
90
+ - 1
91
+ - 1: train/epoch
92
+ 5: 1
93
+ 6:
94
+ - 1
95
+ - 1: eval/loss
96
+ 5: 1
97
+ 6:
98
+ - 1
99
+ - 1: eval/runtime
100
+ 5: 1
101
+ 6:
102
+ - 1
103
+ - 1: eval/samples_per_second
104
+ 5: 1
105
+ 6:
106
+ - 1
107
+ - 1: eval/steps_per_second
108
+ 5: 1
109
+ 6:
110
+ - 1
111
+ - 1: eval/rewards/chosen
112
+ 5: 1
113
+ 6:
114
+ - 1
115
+ - 1: eval/rewards/rejected
116
+ 5: 1
117
+ 6:
118
+ - 1
119
+ - 1: eval/rewards/accuracies
120
+ 5: 1
121
+ 6:
122
+ - 1
123
+ - 1: eval/rewards/margins
124
+ 5: 1
125
+ 6:
126
+ - 1
127
+ - 1: eval/logps/rejected
128
+ 5: 1
129
+ 6:
130
+ - 1
131
+ - 1: eval/logps/chosen
132
+ 5: 1
133
+ 6:
134
+ - 1
135
+ - 1: eval/logits/rejected
136
+ 5: 1
137
+ 6:
138
+ - 1
139
+ - 1: eval/logits/chosen
140
+ 5: 1
141
+ 6:
142
+ - 1
143
+ vocab_size:
144
+ desc: null
145
+ value: 32000
146
+ max_position_embeddings:
147
+ desc: null
148
+ value: 32768
149
+ hidden_size:
150
+ desc: null
151
+ value: 4096
152
+ intermediate_size:
153
+ desc: null
154
+ value: 14336
155
+ num_hidden_layers:
156
+ desc: null
157
+ value: 6
158
+ num_attention_heads:
159
+ desc: null
160
+ value: 32
161
+ sliding_window:
162
+ desc: null
163
+ value: 4096
164
+ num_key_value_heads:
165
+ desc: null
166
+ value: 8
167
+ hidden_act:
168
+ desc: null
169
+ value: silu
170
+ initializer_range:
171
+ desc: null
172
+ value: 0.02
173
+ rms_norm_eps:
174
+ desc: null
175
+ value: 1.0e-05
176
+ use_cache:
177
+ desc: null
178
+ value: false
179
+ rope_theta:
180
+ desc: null
181
+ value: 10000.0
182
+ attention_dropout:
183
+ desc: null
184
+ value: 0.0
185
+ return_dict:
186
+ desc: null
187
+ value: true
188
+ output_hidden_states:
189
+ desc: null
190
+ value: false
191
+ output_attentions:
192
+ desc: null
193
+ value: false
194
+ torchscript:
195
+ desc: null
196
+ value: false
197
+ torch_dtype:
198
+ desc: null
199
+ value: null
200
+ use_bfloat16:
201
+ desc: null
202
+ value: false
203
+ tf_legacy_loss:
204
+ desc: null
205
+ value: false
206
+ pruned_heads:
207
+ desc: null
208
+ value: {}
209
+ tie_word_embeddings:
210
+ desc: null
211
+ value: false
212
+ chunk_size_feed_forward:
213
+ desc: null
214
+ value: 0
215
+ is_encoder_decoder:
216
+ desc: null
217
+ value: false
218
+ is_decoder:
219
+ desc: null
220
+ value: false
221
+ cross_attention_hidden_size:
222
+ desc: null
223
+ value: null
224
+ add_cross_attention:
225
+ desc: null
226
+ value: false
227
+ tie_encoder_decoder:
228
+ desc: null
229
+ value: false
230
+ max_length:
231
+ desc: null
232
+ value: 1024
233
+ min_length:
234
+ desc: null
235
+ value: 0
236
+ do_sample:
237
+ desc: null
238
+ value: false
239
+ early_stopping:
240
+ desc: null
241
+ value: false
242
+ num_beams:
243
+ desc: null
244
+ value: 1
245
+ num_beam_groups:
246
+ desc: null
247
+ value: 1
248
+ diversity_penalty:
249
+ desc: null
250
+ value: 0.0
251
+ temperature:
252
+ desc: null
253
+ value: 1.0
254
+ top_k:
255
+ desc: null
256
+ value: 50
257
+ top_p:
258
+ desc: null
259
+ value: 1.0
260
+ typical_p:
261
+ desc: null
262
+ value: 1.0
263
+ repetition_penalty:
264
+ desc: null
265
+ value: 1.0
266
+ length_penalty:
267
+ desc: null
268
+ value: 1.0
269
+ no_repeat_ngram_size:
270
+ desc: null
271
+ value: 0
272
+ encoder_no_repeat_ngram_size:
273
+ desc: null
274
+ value: 0
275
+ bad_words_ids:
276
+ desc: null
277
+ value: null
278
+ num_return_sequences:
279
+ desc: null
280
+ value: 1
281
+ output_scores:
282
+ desc: null
283
+ value: false
284
+ return_dict_in_generate:
285
+ desc: null
286
+ value: false
287
+ forced_bos_token_id:
288
+ desc: null
289
+ value: null
290
+ forced_eos_token_id:
291
+ desc: null
292
+ value: null
293
+ remove_invalid_values:
294
+ desc: null
295
+ value: false
296
+ exponential_decay_length_penalty:
297
+ desc: null
298
+ value: null
299
+ suppress_tokens:
300
+ desc: null
301
+ value: null
302
+ begin_suppress_tokens:
303
+ desc: null
304
+ value: null
305
+ architectures:
306
+ desc: null
307
+ value:
308
+ - MistralForCausalLM
309
+ finetuning_task:
310
+ desc: null
311
+ value: null
312
+ id2label:
313
+ desc: null
314
+ value:
315
+ '0': LABEL_0
316
+ '1': LABEL_1
317
+ label2id:
318
+ desc: null
319
+ value:
320
+ LABEL_0: 0
321
+ LABEL_1: 1
322
+ tokenizer_class:
323
+ desc: null
324
+ value: null
325
+ prefix:
326
+ desc: null
327
+ value: null
328
+ bos_token_id:
329
+ desc: null
330
+ value: 1
331
+ pad_token_id:
332
+ desc: null
333
+ value: null
334
+ eos_token_id:
335
+ desc: null
336
+ value: 2
337
+ sep_token_id:
338
+ desc: null
339
+ value: null
340
+ decoder_start_token_id:
341
+ desc: null
342
+ value: null
343
+ task_specific_params:
344
+ desc: null
345
+ value: null
346
+ problem_type:
347
+ desc: null
348
+ value: null
349
+ _name_or_path:
350
+ desc: null
351
+ value: sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat-200k
352
+ transformers_version:
353
+ desc: null
354
+ value: 4.40.1
355
+ model_type:
356
+ desc: null
357
+ value: mistral
358
+ output_dir:
359
+ desc: null
360
+ value: ./
361
+ overwrite_output_dir:
362
+ desc: null
363
+ value: false
364
+ do_train:
365
+ desc: null
366
+ value: false
367
+ do_eval:
368
+ desc: null
369
+ value: true
370
+ do_predict:
371
+ desc: null
372
+ value: false
373
+ evaluation_strategy:
374
+ desc: null
375
+ value: steps
376
+ prediction_loss_only:
377
+ desc: null
378
+ value: false
379
+ per_device_train_batch_size:
380
+ desc: null
381
+ value: 8
382
+ per_device_eval_batch_size:
383
+ desc: null
384
+ value: 8
385
+ per_gpu_train_batch_size:
386
+ desc: null
387
+ value: null
388
+ per_gpu_eval_batch_size:
389
+ desc: null
390
+ value: null
391
+ gradient_accumulation_steps:
392
+ desc: null
393
+ value: 2
394
+ eval_accumulation_steps:
395
+ desc: null
396
+ value: null
397
+ eval_delay:
398
+ desc: null
399
+ value: 0
400
+ learning_rate:
401
+ desc: null
402
+ value: 5.0e-07
403
+ weight_decay:
404
+ desc: null
405
+ value: 0.0
406
+ adam_beta1:
407
+ desc: null
408
+ value: 0.9
409
+ adam_beta2:
410
+ desc: null
411
+ value: 0.999
412
+ adam_epsilon:
413
+ desc: null
414
+ value: 1.0e-08
415
+ max_grad_norm:
416
+ desc: null
417
+ value: 1.0
418
+ num_train_epochs:
419
+ desc: null
420
+ value: 1
421
+ max_steps:
422
+ desc: null
423
+ value: -1
424
+ lr_scheduler_type:
425
+ desc: null
426
+ value: cosine
427
+ lr_scheduler_kwargs:
428
+ desc: null
429
+ value: {}
430
+ warmup_ratio:
431
+ desc: null
432
+ value: 0.1
433
+ warmup_steps:
434
+ desc: null
435
+ value: 0
436
+ log_level:
437
+ desc: null
438
+ value: info
439
+ log_level_replica:
440
+ desc: null
441
+ value: warning
442
+ log_on_each_node:
443
+ desc: null
444
+ value: true
445
+ logging_dir:
446
+ desc: null
447
+ value: ./runs/Apr26_16-38-17_ip-26-0-161-178
448
+ logging_strategy:
449
+ desc: null
450
+ value: steps
451
+ logging_first_step:
452
+ desc: null
453
+ value: true
454
+ logging_steps:
455
+ desc: null
456
+ value: 25
457
+ logging_nan_inf_filter:
458
+ desc: null
459
+ value: true
460
+ save_strategy:
461
+ desc: null
462
+ value: steps
463
+ save_steps:
464
+ desc: null
465
+ value: 100
466
+ save_total_limit:
467
+ desc: null
468
+ value: 1
469
+ save_safetensors:
470
+ desc: null
471
+ value: true
472
+ save_on_each_node:
473
+ desc: null
474
+ value: false
475
+ save_only_model:
476
+ desc: null
477
+ value: false
478
+ no_cuda:
479
+ desc: null
480
+ value: false
481
+ use_cpu:
482
+ desc: null
483
+ value: false
484
+ use_mps_device:
485
+ desc: null
486
+ value: false
487
+ seed:
488
+ desc: null
489
+ value: 42
490
+ data_seed:
491
+ desc: null
492
+ value: null
493
+ jit_mode_eval:
494
+ desc: null
495
+ value: false
496
+ use_ipex:
497
+ desc: null
498
+ value: false
499
+ bf16:
500
+ desc: null
501
+ value: true
502
+ fp16:
503
+ desc: null
504
+ value: false
505
+ fp16_opt_level:
506
+ desc: null
507
+ value: O1
508
+ half_precision_backend:
509
+ desc: null
510
+ value: auto
511
+ bf16_full_eval:
512
+ desc: null
513
+ value: false
514
+ fp16_full_eval:
515
+ desc: null
516
+ value: false
517
+ tf32:
518
+ desc: null
519
+ value: null
520
+ local_rank:
521
+ desc: null
522
+ value: 0
523
+ ddp_backend:
524
+ desc: null
525
+ value: null
526
+ tpu_num_cores:
527
+ desc: null
528
+ value: null
529
+ tpu_metrics_debug:
530
+ desc: null
531
+ value: false
532
+ debug:
533
+ desc: null
534
+ value: []
535
+ dataloader_drop_last:
536
+ desc: null
537
+ value: false
538
+ eval_steps:
539
+ desc: null
540
+ value: 100
541
+ dataloader_num_workers:
542
+ desc: null
543
+ value: 0
544
+ dataloader_prefetch_factor:
545
+ desc: null
546
+ value: null
547
+ past_index:
548
+ desc: null
549
+ value: -1
550
+ run_name:
551
+ desc: null
552
+ value: ./
553
+ disable_tqdm:
554
+ desc: null
555
+ value: false
556
+ remove_unused_columns:
557
+ desc: null
558
+ value: false
559
+ label_names:
560
+ desc: null
561
+ value: null
562
+ load_best_model_at_end:
563
+ desc: null
564
+ value: false
565
+ metric_for_best_model:
566
+ desc: null
567
+ value: null
568
+ greater_is_better:
569
+ desc: null
570
+ value: null
571
+ ignore_data_skip:
572
+ desc: null
573
+ value: false
574
+ fsdp:
575
+ desc: null
576
+ value: []
577
+ fsdp_min_num_params:
578
+ desc: null
579
+ value: 0
580
+ fsdp_config:
581
+ desc: null
582
+ value:
583
+ min_num_params: 0
584
+ xla: false
585
+ xla_fsdp_v2: false
586
+ xla_fsdp_grad_ckpt: false
587
+ fsdp_transformer_layer_cls_to_wrap:
588
+ desc: null
589
+ value: null
590
+ accelerator_config:
591
+ desc: null
592
+ value:
593
+ split_batches: false
594
+ dispatch_batches: null
595
+ even_batches: true
596
+ use_seedable_sampler: true
597
+ gradient_accumulation_kwargs: null
598
+ deepspeed:
599
+ desc: null
600
+ value: null
601
+ label_smoothing_factor:
602
+ desc: null
603
+ value: 0.0
604
+ optim:
605
+ desc: null
606
+ value: adamw_torch
607
+ optim_args:
608
+ desc: null
609
+ value: null
610
+ adafactor:
611
+ desc: null
612
+ value: false
613
+ group_by_length:
614
+ desc: null
615
+ value: false
616
+ length_column_name:
617
+ desc: null
618
+ value: length
619
+ report_to:
620
+ desc: null
621
+ value:
622
+ - tensorboard
623
+ - wandb
624
+ ddp_find_unused_parameters:
625
+ desc: null
626
+ value: null
627
+ ddp_bucket_cap_mb:
628
+ desc: null
629
+ value: null
630
+ ddp_broadcast_buffers:
631
+ desc: null
632
+ value: null
633
+ dataloader_pin_memory:
634
+ desc: null
635
+ value: true
636
+ dataloader_persistent_workers:
637
+ desc: null
638
+ value: false
639
+ skip_memory_metrics:
640
+ desc: null
641
+ value: true
642
+ use_legacy_prediction_loop:
643
+ desc: null
644
+ value: false
645
+ push_to_hub:
646
+ desc: null
647
+ value: true
648
+ resume_from_checkpoint:
649
+ desc: null
650
+ value: null
651
+ hub_model_id:
652
+ desc: null
653
+ value: null
654
+ hub_strategy:
655
+ desc: null
656
+ value: every_save
657
+ hub_token:
658
+ desc: null
659
+ value: <HUB_TOKEN>
660
+ hub_private_repo:
661
+ desc: null
662
+ value: false
663
+ hub_always_push:
664
+ desc: null
665
+ value: false
666
+ gradient_checkpointing:
667
+ desc: null
668
+ value: true
669
+ gradient_checkpointing_kwargs:
670
+ desc: null
671
+ value:
672
+ use_reentrant: false
673
+ include_inputs_for_metrics:
674
+ desc: null
675
+ value: false
676
+ eval_do_concat_batches:
677
+ desc: null
678
+ value: true
679
+ fp16_backend:
680
+ desc: null
681
+ value: auto
682
+ push_to_hub_model_id:
683
+ desc: null
684
+ value: null
685
+ push_to_hub_organization:
686
+ desc: null
687
+ value: null
688
+ push_to_hub_token:
689
+ desc: null
690
+ value: <PUSH_TO_HUB_TOKEN>
691
+ mp_parameters:
692
+ desc: null
693
+ value: ''
694
+ auto_find_batch_size:
695
+ desc: null
696
+ value: false
697
+ full_determinism:
698
+ desc: null
699
+ value: false
700
+ torchdynamo:
701
+ desc: null
702
+ value: null
703
+ ray_scope:
704
+ desc: null
705
+ value: last
706
+ ddp_timeout:
707
+ desc: null
708
+ value: 1800
709
+ torch_compile:
710
+ desc: null
711
+ value: false
712
+ torch_compile_backend:
713
+ desc: null
714
+ value: null
715
+ torch_compile_mode:
716
+ desc: null
717
+ value: null
718
+ dispatch_batches:
719
+ desc: null
720
+ value: null
721
+ split_batches:
722
+ desc: null
723
+ value: null
724
+ include_tokens_per_second:
725
+ desc: null
726
+ value: false
727
+ include_num_input_tokens_seen:
728
+ desc: null
729
+ value: false
730
+ neftune_noise_alpha:
731
+ desc: null
732
+ value: null
733
+ optim_target_modules:
734
+ desc: null
735
+ value: null
736
+ beta:
737
+ desc: null
738
+ value: 0.01
739
+ hub_model_revision:
740
+ desc: null
741
+ value: main
742
+ max_prompt_length:
743
+ desc: null
744
+ value: 512
745
+ loss_type:
746
+ desc: null
747
+ value: sigmoid
wandb/run-20240426_164617-71zld9et/files/output.log ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ 0%| | 0/478 [00:00<?, ?it/s][WARNING|modeling_utils.py:1188] 2024-04-26 16:46:23,264 >> Could not estimate the number of tokens of the input, floating-point operations will not be computed
3
+ {'loss': 0.6931, 'grad_norm': 5.702331066131592, 'learning_rate': 1.0416666666666666e-08, 'rewards/chosen': 0.0, 'rewards/rejected': 0.0, 'rewards/accuracies': 0.0, 'rewards/margins': 0.0, 'logps/rejected': -328.838134765625, 'logps/chosen': -438.57470703125, 'logits/rejected': -3.2896294593811035, 'logits/chosen': -3.3417840003967285, 'epoch': 0.0}
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+ 5%|▌ | 24/478 [00:31<09:38, 1.27s/it]
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+ 10%|█ | 49/478 [01:03<09:06, 1.27s/it]
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+ 15%|█▌ | 74/478 [01:35<08:37, 1.28s/it]
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+ 21%|██ | 99/478 [02:06<08:01, 1.27s/it]
70
+ 21%|██ | 100/478 [02:08<08:02, 1.28s/it][INFO|trainer.py:3614] 2024-04-26 16:48:30,367 >> ***** Running Evaluation *****
71
+ [INFO|trainer.py:3616] 2024-04-26 16:48:30,367 >> Num examples = 2000
72
+ [INFO|trainer.py:3619] 2024-04-26 16:48:30,367 >> Batch size = 8
73
+
74
+
75
+
76
+ 88%|████████▊ | 28/32 [00:06<00:01, 3.58it/s]
77
+ [INFO|configuration_utils.py:471] 2024-04-26 16:48:38,653 >> Configuration saved in ./checkpoint-100/config.json
78
+ [INFO|configuration_utils.py:697] 2024-04-26 16:48:38,655 >> Configuration saved in ./checkpoint-100/generation_config.json
79
+ [INFO|modeling_utils.py:2598] 2024-04-26 16:48:46,786 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at ./checkpoint-100/model.safetensors.index.json.
80
+ [INFO|tokenization_utils_base.py:2488] 2024-04-26 16:48:46,789 >> tokenizer config file saved in ./checkpoint-100/tokenizer_config.json
wandb/run-20240426_164617-71zld9et/files/requirements.txt ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GitPython==3.1.43
2
+ Jinja2==3.1.3
3
+ Markdown==3.6
4
+ MarkupSafe==2.1.5
5
+ PyYAML==6.0.1
6
+ Pygments==2.17.2
7
+ Werkzeug==3.0.2
8
+ absl-py==2.1.0
9
+ accelerate==0.29.3
10
+ aiohttp==3.9.5
11
+ aiosignal==1.3.1
12
+ alignment-handbook==0.4.0.dev0
13
+ annotated-types==0.6.0
14
+ appdirs==1.4.4
15
+ attrs==23.2.0
16
+ bitsandbytes==0.43.1
17
+ certifi==2024.2.2
18
+ charset-normalizer==3.3.2
19
+ click==8.1.7
20
+ datasets==2.19.0
21
+ deepspeed==0.14.2
22
+ dill==0.3.8
23
+ docker-pycreds==0.4.0
24
+ docstring_parser==0.16
25
+ einops==0.7.0
26
+ evaluate==0.4.1
27
+ filelock==3.13.4
28
+ frozenlist==1.4.1
29
+ fsspec==2024.3.1
30
+ gitdb==4.0.11
31
+ grpcio==1.62.2
32
+ hf_transfer==0.1.6
33
+ hjson==3.1.0
34
+ huggingface-hub==0.22.2
35
+ idna==3.7
36
+ markdown-it-py==3.0.0
37
+ mdurl==0.1.2
38
+ mpmath==1.3.0
39
+ multidict==6.0.5
40
+ multiprocess==0.70.16
41
+ networkx==3.3
42
+ ninja==1.11.1.1
43
+ numpy==1.26.4
44
+ nvidia-cublas-cu12==12.1.3.1
45
+ nvidia-cuda-cupti-cu12==12.1.105
46
+ nvidia-cuda-nvrtc-cu12==12.1.105
47
+ nvidia-cuda-runtime-cu12==12.1.105
48
+ nvidia-cudnn-cu12==8.9.2.26
49
+ nvidia-cufft-cu12==11.0.2.54
50
+ nvidia-curand-cu12==10.3.2.106
51
+ nvidia-cusolver-cu12==11.4.5.107
52
+ nvidia-cusparse-cu12==12.1.0.106
53
+ nvidia-nccl-cu12==2.19.3
54
+ nvidia-nvjitlink-cu12==12.4.127
55
+ nvidia-nvtx-cu12==12.1.105
56
+ packaging==24.0
57
+ pandas==2.2.2
58
+ peft==0.10.0
59
+ pillow==10.3.0
60
+ pip==23.3.1
61
+ protobuf==3.20.2
62
+ psutil==5.9.8
63
+ py-cpuinfo==9.0.0
64
+ pyarrow-hotfix==0.6
65
+ pyarrow==16.0.0
66
+ pydantic==2.7.1
67
+ pydantic_core==2.18.2
68
+ pynvml==11.5.0
69
+ python-dateutil==2.9.0.post0
70
+ pytz==2024.1
71
+ regex==2024.4.16
72
+ requests==2.31.0
73
+ responses==0.18.0
74
+ rich==13.7.1
75
+ safetensors==0.4.3
76
+ scipy==1.13.0
77
+ sentencepiece==0.2.0
78
+ sentry-sdk==2.0.0
79
+ setproctitle==1.3.3
80
+ setuptools==68.2.2
81
+ shtab==1.7.1
82
+ six==1.16.0
83
+ smmap==5.0.1
84
+ sympy==1.12
85
+ tensorboard-data-server==0.7.2
86
+ tensorboard==2.16.2
87
+ tokenizers==0.19.1
88
+ torch==2.2.2
89
+ torchaudio==2.2.2
90
+ torchvision==0.17.2
91
+ tqdm==4.66.2
92
+ transformers==4.40.1
93
+ triton==2.2.0
94
+ trl==0.8.6
95
+ typing_extensions==4.11.0
96
+ tyro==0.8.3
97
+ tzdata==2024.1
98
+ urllib3==2.2.1
99
+ wandb==0.16.6
100
+ wheel==0.41.2
101
+ xxhash==3.4.1
102
+ yarl==1.9.4
wandb/run-20240426_164617-71zld9et/files/wandb-metadata.json ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1048-aws-x86_64-with-glibc2.31",
3
+ "python": "3.11.9",
4
+ "heartbeatAt": "2024-04-26T16:46:17.529449",
5
+ "startedAt": "2024-04-26T16:46:17.086330",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "./config_200k.yaml"
10
+ ],
11
+ "state": "running",
12
+ "program": "/fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/./run_dpo.py",
13
+ "codePathLocal": "run_dpo.py",
14
+ "codePath": "run_dpo.py",
15
+ "git": {
16
+ "remote": "https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-dpo-ultrafeedback-200k",
17
+ "commit": "ad515d951da0956ace4aee8562ec51c4a6e5f486"
18
+ },
19
+ "email": null,
20
+ "root": "/fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k",
21
+ "host": "ip-26-0-161-178",
22
+ "username": "sanchit",
23
+ "executable": "/fsx/sanchit/miniconda3/envs/alignment/bin/python",
24
+ "cpu_count": 96,
25
+ "cpu_count_logical": 96,
26
+ "cpu_freq": {
27
+ "current": 2727.4798437500003,
28
+ "min": 0.0,
29
+ "max": 0.0
30
+ },
31
+ "cpu_freq_per_core": [
32
+ {
33
+ "current": 2650.0,
34
+ "min": 0.0,
35
+ "max": 0.0
36
+ },
37
+ {
38
+ "current": 2650.0,
39
+ "min": 0.0,
40
+ "max": 0.0
41
+ },
42
+ {
43
+ "current": 2650.0,
44
+ "min": 0.0,
45
+ "max": 0.0
46
+ },
47
+ {
48
+ "current": 2650.0,
49
+ "min": 0.0,
50
+ "max": 0.0
51
+ },
52
+ {
53
+ "current": 2650.0,
54
+ "min": 0.0,
55
+ "max": 0.0
56
+ },
57
+ {
58
+ "current": 2650.0,
59
+ "min": 0.0,
60
+ "max": 0.0
61
+ },
62
+ {
63
+ "current": 2650.0,
64
+ "min": 0.0,
65
+ "max": 0.0
66
+ },
67
+ {
68
+ "current": 2650.0,
69
+ "min": 0.0,
70
+ "max": 0.0
71
+ },
72
+ {
73
+ "current": 2650.0,
74
+ "min": 0.0,
75
+ "max": 0.0
76
+ },
77
+ {
78
+ "current": 2650.0,
79
+ "min": 0.0,
80
+ "max": 0.0
81
+ },
82
+ {
83
+ "current": 2650.0,
84
+ "min": 0.0,
85
+ "max": 0.0
86
+ },
87
+ {
88
+ "current": 3538.047,
89
+ "min": 0.0,
90
+ "max": 0.0
91
+ },
92
+ {
93
+ "current": 2650.0,
94
+ "min": 0.0,
95
+ "max": 0.0
96
+ },
97
+ {
98
+ "current": 2650.0,
99
+ "min": 0.0,
100
+ "max": 0.0
101
+ },
102
+ {
103
+ "current": 2650.0,
104
+ "min": 0.0,
105
+ "max": 0.0
106
+ },
107
+ {
108
+ "current": 2650.0,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2650.0,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2650.0,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2650.0,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2650.0,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2650.0,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2650.0,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2650.0,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2650.0,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2650.0,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2650.0,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2650.0,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2650.0,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2650.0,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2650.0,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2650.0,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2650.0,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2650.0,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2650.0,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ },
202
+ {
203
+ "current": 2650.0,
204
+ "min": 0.0,
205
+ "max": 0.0
206
+ },
207
+ {
208
+ "current": 2650.0,
209
+ "min": 0.0,
210
+ "max": 0.0
211
+ },
212
+ {
213
+ "current": 2650.0,
214
+ "min": 0.0,
215
+ "max": 0.0
216
+ },
217
+ {
218
+ "current": 2650.0,
219
+ "min": 0.0,
220
+ "max": 0.0
221
+ },
222
+ {
223
+ "current": 2650.0,
224
+ "min": 0.0,
225
+ "max": 0.0
226
+ },
227
+ {
228
+ "current": 2650.0,
229
+ "min": 0.0,
230
+ "max": 0.0
231
+ },
232
+ {
233
+ "current": 3538.387,
234
+ "min": 0.0,
235
+ "max": 0.0
236
+ },
237
+ {
238
+ "current": 2650.0,
239
+ "min": 0.0,
240
+ "max": 0.0
241
+ },
242
+ {
243
+ "current": 2650.0,
244
+ "min": 0.0,
245
+ "max": 0.0
246
+ },
247
+ {
248
+ "current": 2650.0,
249
+ "min": 0.0,
250
+ "max": 0.0
251
+ },
252
+ {
253
+ "current": 2650.0,
254
+ "min": 0.0,
255
+ "max": 0.0
256
+ },
257
+ {
258
+ "current": 2650.0,
259
+ "min": 0.0,
260
+ "max": 0.0
261
+ },
262
+ {
263
+ "current": 2650.0,
264
+ "min": 0.0,
265
+ "max": 0.0
266
+ },
267
+ {
268
+ "current": 2650.0,
269
+ "min": 0.0,
270
+ "max": 0.0
271
+ },
272
+ {
273
+ "current": 2650.0,
274
+ "min": 0.0,
275
+ "max": 0.0
276
+ },
277
+ {
278
+ "current": 2650.0,
279
+ "min": 0.0,
280
+ "max": 0.0
281
+ },
282
+ {
283
+ "current": 3598.515,
284
+ "min": 0.0,
285
+ "max": 0.0
286
+ },
287
+ {
288
+ "current": 2650.0,
289
+ "min": 0.0,
290
+ "max": 0.0
291
+ },
292
+ {
293
+ "current": 2650.0,
294
+ "min": 0.0,
295
+ "max": 0.0
296
+ },
297
+ {
298
+ "current": 2650.0,
299
+ "min": 0.0,
300
+ "max": 0.0
301
+ },
302
+ {
303
+ "current": 2650.0,
304
+ "min": 0.0,
305
+ "max": 0.0
306
+ },
307
+ {
308
+ "current": 2650.0,
309
+ "min": 0.0,
310
+ "max": 0.0
311
+ },
312
+ {
313
+ "current": 2650.0,
314
+ "min": 0.0,
315
+ "max": 0.0
316
+ },
317
+ {
318
+ "current": 2650.0,
319
+ "min": 0.0,
320
+ "max": 0.0
321
+ },
322
+ {
323
+ "current": 2650.0,
324
+ "min": 0.0,
325
+ "max": 0.0
326
+ },
327
+ {
328
+ "current": 2650.0,
329
+ "min": 0.0,
330
+ "max": 0.0
331
+ },
332
+ {
333
+ "current": 2650.0,
334
+ "min": 0.0,
335
+ "max": 0.0
336
+ },
337
+ {
338
+ "current": 3598.363,
339
+ "min": 0.0,
340
+ "max": 0.0
341
+ },
342
+ {
343
+ "current": 2650.0,
344
+ "min": 0.0,
345
+ "max": 0.0
346
+ },
347
+ {
348
+ "current": 2650.0,
349
+ "min": 0.0,
350
+ "max": 0.0
351
+ },
352
+ {
353
+ "current": 3593.838,
354
+ "min": 0.0,
355
+ "max": 0.0
356
+ },
357
+ {
358
+ "current": 2650.0,
359
+ "min": 0.0,
360
+ "max": 0.0
361
+ },
362
+ {
363
+ "current": 2650.0,
364
+ "min": 0.0,
365
+ "max": 0.0
366
+ },
367
+ {
368
+ "current": 2650.0,
369
+ "min": 0.0,
370
+ "max": 0.0
371
+ },
372
+ {
373
+ "current": 2650.0,
374
+ "min": 0.0,
375
+ "max": 0.0
376
+ },
377
+ {
378
+ "current": 2650.0,
379
+ "min": 0.0,
380
+ "max": 0.0
381
+ },
382
+ {
383
+ "current": 2650.0,
384
+ "min": 0.0,
385
+ "max": 0.0
386
+ },
387
+ {
388
+ "current": 2650.0,
389
+ "min": 0.0,
390
+ "max": 0.0
391
+ },
392
+ {
393
+ "current": 3598.138,
394
+ "min": 0.0,
395
+ "max": 0.0
396
+ },
397
+ {
398
+ "current": 2650.0,
399
+ "min": 0.0,
400
+ "max": 0.0
401
+ },
402
+ {
403
+ "current": 2650.0,
404
+ "min": 0.0,
405
+ "max": 0.0
406
+ },
407
+ {
408
+ "current": 2650.0,
409
+ "min": 0.0,
410
+ "max": 0.0
411
+ },
412
+ {
413
+ "current": 2650.0,
414
+ "min": 0.0,
415
+ "max": 0.0
416
+ },
417
+ {
418
+ "current": 2650.0,
419
+ "min": 0.0,
420
+ "max": 0.0
421
+ },
422
+ {
423
+ "current": 2650.0,
424
+ "min": 0.0,
425
+ "max": 0.0
426
+ },
427
+ {
428
+ "current": 2650.0,
429
+ "min": 0.0,
430
+ "max": 0.0
431
+ },
432
+ {
433
+ "current": 2650.0,
434
+ "min": 0.0,
435
+ "max": 0.0
436
+ },
437
+ {
438
+ "current": 3574.723,
439
+ "min": 0.0,
440
+ "max": 0.0
441
+ },
442
+ {
443
+ "current": 2650.0,
444
+ "min": 0.0,
445
+ "max": 0.0
446
+ },
447
+ {
448
+ "current": 2650.0,
449
+ "min": 0.0,
450
+ "max": 0.0
451
+ },
452
+ {
453
+ "current": 2650.0,
454
+ "min": 0.0,
455
+ "max": 0.0
456
+ },
457
+ {
458
+ "current": 2650.0,
459
+ "min": 0.0,
460
+ "max": 0.0
461
+ },
462
+ {
463
+ "current": 2650.0,
464
+ "min": 0.0,
465
+ "max": 0.0
466
+ },
467
+ {
468
+ "current": 2650.0,
469
+ "min": 0.0,
470
+ "max": 0.0
471
+ },
472
+ {
473
+ "current": 2650.0,
474
+ "min": 0.0,
475
+ "max": 0.0
476
+ },
477
+ {
478
+ "current": 2650.0,
479
+ "min": 0.0,
480
+ "max": 0.0
481
+ },
482
+ {
483
+ "current": 2650.0,
484
+ "min": 0.0,
485
+ "max": 0.0
486
+ },
487
+ {
488
+ "current": 2650.0,
489
+ "min": 0.0,
490
+ "max": 0.0
491
+ },
492
+ {
493
+ "current": 2650.0,
494
+ "min": 0.0,
495
+ "max": 0.0
496
+ },
497
+ {
498
+ "current": 3598.054,
499
+ "min": 0.0,
500
+ "max": 0.0
501
+ },
502
+ {
503
+ "current": 2650.0,
504
+ "min": 0.0,
505
+ "max": 0.0
506
+ },
507
+ {
508
+ "current": 2650.0,
509
+ "min": 0.0,
510
+ "max": 0.0
511
+ }
512
+ ],
513
+ "disk": {
514
+ "/": {
515
+ "total": 290.7472343444824,
516
+ "used": 62.08610534667969
517
+ }
518
+ },
519
+ "gpu": "NVIDIA H100 80GB HBM3",
520
+ "gpu_count": 8,
521
+ "gpu_devices": [
522
+ {
523
+ "name": "NVIDIA H100 80GB HBM3",
524
+ "memory_total": 85520809984
525
+ },
526
+ {
527
+ "name": "NVIDIA H100 80GB HBM3",
528
+ "memory_total": 85520809984
529
+ },
530
+ {
531
+ "name": "NVIDIA H100 80GB HBM3",
532
+ "memory_total": 85520809984
533
+ },
534
+ {
535
+ "name": "NVIDIA H100 80GB HBM3",
536
+ "memory_total": 85520809984
537
+ },
538
+ {
539
+ "name": "NVIDIA H100 80GB HBM3",
540
+ "memory_total": 85520809984
541
+ },
542
+ {
543
+ "name": "NVIDIA H100 80GB HBM3",
544
+ "memory_total": 85520809984
545
+ },
546
+ {
547
+ "name": "NVIDIA H100 80GB HBM3",
548
+ "memory_total": 85520809984
549
+ },
550
+ {
551
+ "name": "NVIDIA H100 80GB HBM3",
552
+ "memory_total": 85520809984
553
+ }
554
+ ],
555
+ "memory": {
556
+ "total": 1999.9855346679688
557
+ }
558
+ }
wandb/run-20240426_164617-71zld9et/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/loss": 0.6832, "train/grad_norm": 5.553352355957031, "train/learning_rate": 4.821741763807186e-07, "train/rewards/chosen": -0.023883424699306488, "train/rewards/rejected": -0.04398660734295845, "train/rewards/accuracies": 0.6175000071525574, "train/rewards/margins": 0.020103182643651962, "train/logps/rejected": -390.61370849609375, "train/logps/chosen": -430.1453857421875, "train/logits/rejected": -3.442944049835205, "train/logits/chosen": -3.469045400619507, "train/epoch": 0.20920502092050208, "train/global_step": 100, "_timestamp": 1714150118.6419067, "_runtime": 141.53320169448853, "_step": 5, "eval/loss": 0.6805875301361084, "eval/runtime": 8.2626, "eval/samples_per_second": 242.055, "eval/steps_per_second": 3.873, "eval/rewards/chosen": -0.03507506474852562, "eval/rewards/rejected": -0.06189027801156044, "eval/rewards/accuracies": 0.63671875, "eval/rewards/margins": 0.02681521140038967, "eval/logps/rejected": -402.3321533203125, "eval/logps/chosen": -424.9571838378906, "eval/logits/rejected": -3.494518756866455, "eval/logits/chosen": -3.529766798019409}
wandb/run-20240426_164617-71zld9et/logs/debug-internal.log ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-04-26 16:46:17,106 INFO StreamThr :307541 [internal.py:wandb_internal():86] W&B internal server running at pid: 307541, started at: 2024-04-26 16:46:17.104548
2
+ 2024-04-26 16:46:17,107 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status
3
+ 2024-04-26 16:46:17,112 INFO WriterThread:307541 [datastore.py:open_for_write():87] open: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/run-71zld9et.wandb
4
+ 2024-04-26 16:46:17,113 DEBUG SenderThread:307541 [sender.py:send():379] send: header
5
+ 2024-04-26 16:46:17,157 DEBUG SenderThread:307541 [sender.py:send():379] send: run
6
+ 2024-04-26 16:46:17,358 INFO SenderThread:307541 [dir_watcher.py:__init__():211] watching files in: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files
7
+ 2024-04-26 16:46:17,358 INFO SenderThread:307541 [sender.py:_start_run_threads():1124] run started: 71zld9et with start time 1714149977.108705
8
+ 2024-04-26 16:46:17,364 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-04-26 16:46:17,364 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: check_version
10
+ 2024-04-26 16:46:17,420 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-04-26 16:46:17,480 DEBUG HandlerThread:307541 [system_info.py:__init__():26] System info init
12
+ 2024-04-26 16:46:17,480 DEBUG HandlerThread:307541 [system_info.py:__init__():41] System info init done
13
+ 2024-04-26 16:46:17,481 INFO HandlerThread:307541 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-04-26 16:46:17,481 INFO SystemMonitor:307541 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-04-26 16:46:17,481 INFO HandlerThread:307541 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-04-26 16:46:17,481 INFO SystemMonitor:307541 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-04-26 16:46:17,481 INFO SystemMonitor:307541 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-04-26 16:46:17,482 INFO SystemMonitor:307541 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-04-26 16:46:17,482 INFO SystemMonitor:307541 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-04-26 16:46:17,483 INFO SystemMonitor:307541 [interfaces.py:start():190] Started network monitoring
21
+ 2024-04-26 16:46:17,529 DEBUG HandlerThread:307541 [system_info.py:probe():150] Probing system
22
+ 2024-04-26 16:46:17,531 DEBUG HandlerThread:307541 [system_info.py:_probe_git():135] Probing git
23
+ 2024-04-26 16:46:17,552 DEBUG HandlerThread:307541 [system_info.py:_probe_git():143] Probing git done
24
+ 2024-04-26 16:46:17,552 DEBUG HandlerThread:307541 [system_info.py:probe():198] Probing system done
25
+ 2024-04-26 16:46:17,552 DEBUG HandlerThread:307541 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-1048-aws-x86_64-with-glibc2.31', 'python': '3.11.9', 'heartbeatAt': '2024-04-26T16:46:17.529449', 'startedAt': '2024-04-26T16:46:17.086330', 'docker': None, 'cuda': None, 'args': ('./config_200k.yaml',), 'state': 'running', 'program': '/fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/./run_dpo.py', 'codePathLocal': 'run_dpo.py', 'codePath': 'run_dpo.py', 'git': {'remote': 'https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-dpo-ultrafeedback-200k', 'commit': 'ad515d951da0956ace4aee8562ec51c4a6e5f486'}, 'email': None, 'root': '/fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k', 'host': 'ip-26-0-161-178', 'username': 'sanchit', 'executable': '/fsx/sanchit/miniconda3/envs/alignment/bin/python', 'cpu_count': 96, 'cpu_count_logical': 96, 'cpu_freq': {'current': 2727.4798437500003, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3538.047, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, 
{'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3538.387, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.515, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.363, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3593.838, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 
2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.138, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3574.723, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.054, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 290.7472343444824, 'used': 62.08610534667969}}, 'gpu': 'NVIDIA H100 80GB HBM3', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}], 'memory': {'total': 1999.9855346679688}}
26
+ 2024-04-26 16:46:17,552 INFO HandlerThread:307541 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-04-26 16:46:17,552 INFO HandlerThread:307541 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-04-26 16:46:17,552 DEBUG HandlerThread:307541 [system_info.py:_save_conda():207] Saving list of conda packages installed into the current environment
29
+ 2024-04-26 16:46:18,360 INFO Thread-12 :307541 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/conda-environment.yaml
30
+ 2024-04-26 16:46:21,876 DEBUG HandlerThread:307541 [system_info.py:_save_conda():222] Saving conda packages done
31
+ 2024-04-26 16:46:21,878 INFO HandlerThread:307541 [system_monitor.py:probe():229] Finished publishing system info
32
+ 2024-04-26 16:46:21,902 DEBUG SenderThread:307541 [sender.py:send():379] send: files
33
+ 2024-04-26 16:46:21,902 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-metadata.json with policy now
34
+ 2024-04-26 16:46:22,045 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: python_packages
35
+ 2024-04-26 16:46:22,045 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: python_packages
36
+ 2024-04-26 16:46:22,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
37
+ 2024-04-26 16:46:22,047 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
38
+ 2024-04-26 16:46:22,048 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
39
+ 2024-04-26 16:46:22,158 INFO wandb-upload_0:307541 [upload_job.py:push():131] Uploaded file /tmp/tmpyc0gjhuhwandb/b5y5043z-wandb-metadata.json
40
+ 2024-04-26 16:46:22,192 DEBUG SenderThread:307541 [sender.py:send():379] send: telemetry
41
+ 2024-04-26 16:46:22,192 DEBUG SenderThread:307541 [sender.py:send():379] send: config
42
+ 2024-04-26 16:46:22,192 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-04-26 16:46:22,193 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
44
+ 2024-04-26 16:46:22,193 DEBUG SenderThread:307541 [sender.py:send():379] send: telemetry
45
+ 2024-04-26 16:46:22,193 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
46
+ 2024-04-26 16:46:22,193 WARNING SenderThread:307541 [sender.py:send_metric():1341] Seen metric with glob (shouldn't happen)
47
+ 2024-04-26 16:46:22,193 DEBUG SenderThread:307541 [sender.py:send():379] send: telemetry
48
+ 2024-04-26 16:46:22,363 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/conda-environment.yaml
49
+ 2024-04-26 16:46:22,364 INFO Thread-12 :307541 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/requirements.txt
50
+ 2024-04-26 16:46:22,364 INFO Thread-12 :307541 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
51
+ 2024-04-26 16:46:22,364 INFO Thread-12 :307541 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-metadata.json
52
+ 2024-04-26 16:46:24,157 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
53
+ 2024-04-26 16:46:24,160 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
54
+ 2024-04-26 16:46:24,160 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
55
+ 2024-04-26 16:46:24,160 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
56
+ 2024-04-26 16:46:24,160 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
57
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
58
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
59
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
60
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
61
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
62
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
63
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
64
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
65
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send():379] send: history
66
+ 2024-04-26 16:46:24,161 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
67
+ 2024-04-26 16:46:24,164 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
68
+ 2024-04-26 16:46:24,366 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
69
+ 2024-04-26 16:46:24,366 INFO Thread-12 :307541 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
70
+ 2024-04-26 16:46:26,368 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
71
+ 2024-04-26 16:46:27,676 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
72
+ 2024-04-26 16:46:28,370 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
73
+ 2024-04-26 16:46:30,371 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
74
+ 2024-04-26 16:46:32,373 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
75
+ 2024-04-26 16:46:32,742 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
76
+ 2024-04-26 16:46:34,375 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
77
+ 2024-04-26 16:46:36,377 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
78
+ 2024-04-26 16:46:37,045 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
79
+ 2024-04-26 16:46:37,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
80
+ 2024-04-26 16:46:37,047 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
81
+ 2024-04-26 16:46:38,098 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
82
+ 2024-04-26 16:46:38,379 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
83
+ 2024-04-26 16:46:40,381 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
84
+ 2024-04-26 16:46:42,382 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
85
+ 2024-04-26 16:46:43,102 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
86
+ 2024-04-26 16:46:44,384 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
87
+ 2024-04-26 16:46:46,386 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
88
+ 2024-04-26 16:46:48,221 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-04-26 16:46:48,388 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/config.yaml
90
+ 2024-04-26 16:46:48,389 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
91
+ 2024-04-26 16:46:50,390 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
92
+ 2024-04-26 16:46:52,045 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
93
+ 2024-04-26 16:46:52,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
94
+ 2024-04-26 16:46:52,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
95
+ 2024-04-26 16:46:52,392 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
96
+ 2024-04-26 16:46:54,305 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
97
+ 2024-04-26 16:46:54,394 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
98
+ 2024-04-26 16:46:54,619 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
99
+ 2024-04-26 16:46:54,621 DEBUG SenderThread:307541 [sender.py:send():379] send: history
100
+ 2024-04-26 16:46:54,621 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
101
+ 2024-04-26 16:46:54,623 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
102
+ 2024-04-26 16:46:55,395 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
103
+ 2024-04-26 16:46:56,396 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
104
+ 2024-04-26 16:46:58,398 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
105
+ 2024-04-26 16:46:59,425 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
106
+ 2024-04-26 16:47:00,400 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
107
+ 2024-04-26 16:47:02,402 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
108
+ 2024-04-26 16:47:04,404 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
109
+ 2024-04-26 16:47:04,502 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
110
+ 2024-04-26 16:47:06,405 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
111
+ 2024-04-26 16:47:07,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
112
+ 2024-04-26 16:47:07,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
113
+ 2024-04-26 16:47:07,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
114
+ 2024-04-26 16:47:08,407 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
115
+ 2024-04-26 16:47:09,566 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
116
+ 2024-04-26 16:47:10,409 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
117
+ 2024-04-26 16:47:12,411 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
118
+ 2024-04-26 16:47:14,413 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
119
+ 2024-04-26 16:47:14,686 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
120
+ 2024-04-26 16:47:16,415 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
121
+ 2024-04-26 16:47:17,483 DEBUG SystemMonitor:307541 [system_monitor.py:_start():172] Starting system metrics aggregation loop
122
+ 2024-04-26 16:47:17,486 DEBUG SenderThread:307541 [sender.py:send():379] send: stats
123
+ 2024-04-26 16:47:18,416 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
124
+ 2024-04-26 16:47:19,787 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
125
+ 2024-04-26 16:47:20,419 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
126
+ 2024-04-26 16:47:22,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
127
+ 2024-04-26 16:47:22,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
128
+ 2024-04-26 16:47:22,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
129
+ 2024-04-26 16:47:22,421 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
130
+ 2024-04-26 16:47:24,423 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
131
+ 2024-04-26 16:47:24,834 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
132
+ 2024-04-26 16:47:26,424 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
133
+ 2024-04-26 16:47:26,474 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
134
+ 2024-04-26 16:47:26,476 DEBUG SenderThread:307541 [sender.py:send():379] send: history
135
+ 2024-04-26 16:47:26,477 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
136
+ 2024-04-26 16:47:26,478 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
137
+ 2024-04-26 16:47:27,426 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
138
+ 2024-04-26 16:47:28,427 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
139
+ 2024-04-26 16:47:29,999 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
140
+ 2024-04-26 16:47:30,430 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
141
+ 2024-04-26 16:47:32,432 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
142
+ 2024-04-26 16:47:34,433 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
143
+ 2024-04-26 16:47:35,142 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
144
+ 2024-04-26 16:47:36,435 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
145
+ 2024-04-26 16:47:37,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
146
+ 2024-04-26 16:47:37,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
147
+ 2024-04-26 16:47:37,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
148
+ 2024-04-26 16:47:38,437 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
149
+ 2024-04-26 16:47:40,216 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
150
+ 2024-04-26 16:47:40,439 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
151
+ 2024-04-26 16:47:42,441 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
152
+ 2024-04-26 16:47:44,442 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
153
+ 2024-04-26 16:47:45,370 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
154
+ 2024-04-26 16:47:46,444 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
155
+ 2024-04-26 16:47:47,488 DEBUG SenderThread:307541 [sender.py:send():379] send: stats
156
+ 2024-04-26 16:47:48,446 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
157
+ 2024-04-26 16:47:50,431 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
158
+ 2024-04-26 16:47:50,448 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
159
+ 2024-04-26 16:47:52,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
160
+ 2024-04-26 16:47:52,047 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
161
+ 2024-04-26 16:47:52,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
162
+ 2024-04-26 16:47:52,450 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
163
+ 2024-04-26 16:47:54,451 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
164
+ 2024-04-26 16:47:55,581 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
165
+ 2024-04-26 16:47:56,453 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
166
+ 2024-04-26 16:47:58,455 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
167
+ 2024-04-26 16:47:58,481 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
168
+ 2024-04-26 16:47:58,483 DEBUG SenderThread:307541 [sender.py:send():379] send: history
169
+ 2024-04-26 16:47:58,483 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
170
+ 2024-04-26 16:47:58,485 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
171
+ 2024-04-26 16:47:59,456 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
172
+ 2024-04-26 16:48:00,457 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
173
+ 2024-04-26 16:48:00,713 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
174
+ 2024-04-26 16:48:02,459 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
175
+ 2024-04-26 16:48:04,461 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
176
+ 2024-04-26 16:48:05,871 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
177
+ 2024-04-26 16:48:06,464 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
178
+ 2024-04-26 16:48:07,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
179
+ 2024-04-26 16:48:07,047 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
180
+ 2024-04-26 16:48:07,049 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
181
+ 2024-04-26 16:48:08,466 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
182
+ 2024-04-26 16:48:10,470 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
183
+ 2024-04-26 16:48:10,999 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
184
+ 2024-04-26 16:48:12,475 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
185
+ 2024-04-26 16:48:14,479 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
186
+ 2024-04-26 16:48:16,288 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
187
+ 2024-04-26 16:48:16,484 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
188
+ 2024-04-26 16:48:17,490 DEBUG SenderThread:307541 [sender.py:send():379] send: stats
189
+ 2024-04-26 16:48:18,489 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
190
+ 2024-04-26 16:48:20,494 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
191
+ 2024-04-26 16:48:21,395 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
192
+ 2024-04-26 16:48:22,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
193
+ 2024-04-26 16:48:22,047 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
194
+ 2024-04-26 16:48:22,047 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
195
+ 2024-04-26 16:48:22,500 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
196
+ 2024-04-26 16:48:24,504 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
197
+ 2024-04-26 16:48:26,477 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
198
+ 2024-04-26 16:48:26,506 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
199
+ 2024-04-26 16:48:28,508 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
200
+ 2024-04-26 16:48:30,366 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
201
+ 2024-04-26 16:48:30,368 DEBUG SenderThread:307541 [sender.py:send():379] send: history
202
+ 2024-04-26 16:48:30,369 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
203
+ 2024-04-26 16:48:30,370 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
204
+ 2024-04-26 16:48:30,510 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
205
+ 2024-04-26 16:48:30,511 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
206
+ 2024-04-26 16:48:31,653 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
207
+ 2024-04-26 16:48:32,512 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
208
+ 2024-04-26 16:48:34,514 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
209
+ 2024-04-26 16:48:36,516 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
210
+ 2024-04-26 16:48:36,761 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
211
+ 2024-04-26 16:48:37,046 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
212
+ 2024-04-26 16:48:37,046 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
213
+ 2024-04-26 16:48:37,048 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
214
+ 2024-04-26 16:48:38,517 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
215
+ 2024-04-26 16:48:38,642 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: partial_history
216
+ 2024-04-26 16:48:38,645 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
217
+ 2024-04-26 16:48:38,646 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
218
+ 2024-04-26 16:48:38,647 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
219
+ 2024-04-26 16:48:38,647 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
220
+ 2024-04-26 16:48:38,647 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
221
+ 2024-04-26 16:48:38,647 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
222
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
223
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
224
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
225
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
226
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
227
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: metric
228
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send():379] send: history
229
+ 2024-04-26 16:48:38,648 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: summary_record
230
+ 2024-04-26 16:48:38,650 INFO SenderThread:307541 [sender.py:_save_file():1390] saving file wandb-summary.json with policy end
231
+ 2024-04-26 16:48:39,519 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/wandb-summary.json
232
+ 2024-04-26 16:48:40,520 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
233
+ 2024-04-26 16:48:42,656 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
234
+ 2024-04-26 16:48:47,492 DEBUG SenderThread:307541 [sender.py:send():379] send: stats
235
+ 2024-04-26 16:48:48,493 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
236
+ 2024-04-26 16:48:48,528 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/output.log
237
+ 2024-04-26 16:48:52,631 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: internal_messages
238
+ 2024-04-26 16:48:53,418 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: stop_status
239
+ 2024-04-26 16:48:53,418 DEBUG SenderThread:307541 [sender.py:send_request():406] send_request: stop_status
240
+ 2024-04-26 16:48:53,567 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
241
+ 2024-04-26 16:48:54,535 INFO Thread-12 :307541 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/files/config.yaml
242
+ 2024-04-26 16:48:58,655 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
243
+ 2024-04-26 16:49:03,655 DEBUG HandlerThread:307541 [handler.py:handle_request():146] handle_request: status_report
wandb/run-20240426_164617-71zld9et/logs/debug.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-04-26 16:46:17,096 INFO MainThread:306341 [wandb_setup.py:_flush():76] Current SDK version is 0.16.6
2
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Configure stats pid to 306341
3
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
4
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/settings
5
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_dpo.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/run_dpo.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/./run_dpo.py'}
8
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_setup.py:_flush():76] Applying login settings: {}
9
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:_log_setup():521] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/logs/debug.log
10
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:_log_setup():522] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-dpo-ultrafeedback-200k/wandb/run-20240426_164617-71zld9et/logs/debug-internal.log
11
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:init():561] calling init triggers
12
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:init():568] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:init():611] starting backend
15
+ 2024-04-26 16:46:17,097 INFO MainThread:306341 [wandb_init.py:init():615] setting up manager
16
+ 2024-04-26 16:46:17,102 INFO MainThread:306341 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-04-26 16:46:17,108 INFO MainThread:306341 [wandb_init.py:init():623] backend started and connected
18
+ 2024-04-26 16:46:17,111 INFO MainThread:306341 [wandb_init.py:init():715] updated telemetry
19
+ 2024-04-26 16:46:17,156 INFO MainThread:306341 [wandb_init.py:init():748] communicating run to backend with 90.0 second timeout
20
+ 2024-04-26 16:46:17,363 INFO MainThread:306341 [wandb_run.py:_on_init():2357] communicating current version
21
+ 2024-04-26 16:46:17,413 INFO MainThread:306341 [wandb_run.py:_on_init():2366] got version response
22
+ 2024-04-26 16:46:17,414 INFO MainThread:306341 [wandb_init.py:init():799] starting run threads in backend
23
+ 2024-04-26 16:46:22,046 INFO MainThread:306341 [wandb_run.py:_console_start():2335] atexit reg
24
+ 2024-04-26 16:46:22,046 INFO MainThread:306341 [wandb_run.py:_redirect():2190] redirect: wrap_raw
25
+ 2024-04-26 16:46:22,046 INFO MainThread:306341 [wandb_run.py:_redirect():2255] Wrapping output streams.
26
+ 2024-04-26 16:46:22,046 INFO MainThread:306341 [wandb_run.py:_redirect():2280] Redirects installed.
27
+ 2024-04-26 16:46:22,047 INFO MainThread:306341 [wandb_init.py:init():842] run started, returning control to user process
28
+ 2024-04-26 16:46:22,048 INFO MainThread:306341 [wandb_run.py:_config_callback():1347] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 1024, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat-200k', 'transformers_version': '4.40.1', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 5e-07, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr26_16-38-17_ip-26-0-161-178', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'beta': 0.01, 'hub_model_revision': 'main', 'max_prompt_length': 512, 'loss_type': 'sigmoid'}
wandb/run-20240426_164617-71zld9et/run-71zld9et.wandb ADDED
Binary file (33.2 kB). View file