sanchit-gandhi HF staff committed on
Commit
4ea2eae
1 Parent(s): cbea69c

Training in progress, step 500

Browse files
Files changed (43) hide show
  1. accelerate_config.yaml +18 -0
  2. alignment/__init__.py +12 -0
  3. alignment/__pycache__/__init__.cpython-311.pyc +0 -0
  4. alignment/__pycache__/configs.cpython-311.pyc +0 -0
  5. alignment/__pycache__/data.cpython-311.pyc +0 -0
  6. alignment/__pycache__/model_utils.cpython-311.pyc +0 -0
  7. alignment/configs.py +254 -0
  8. alignment/data.py +190 -0
  9. alignment/model_utils.py +119 -0
  10. alignment/release.py +106 -0
  11. config.json +26 -0
  12. config_full.yaml +45 -0
  13. model.safetensors +3 -0
  14. run_sft.py +218 -0
  15. runs/Apr24_14-23-38_ip-26-0-162-233/events.out.tfevents.1713973415.ip-26-0-162-233.1840687.0 +3 -0
  16. runs/Apr24_16-42-31_ip-26-0-162-233/events.out.tfevents.1713977002.ip-26-0-162-233.1854033.0 +3 -0
  17. slurm_job.slurm +76 -0
  18. special_tokens_map.json +24 -0
  19. tokenizer.json +0 -0
  20. tokenizer.model +3 -0
  21. tokenizer_config.json +43 -0
  22. training_args.bin +3 -0
  23. wandb/debug-cli.sanchit.log +0 -0
  24. wandb/debug-internal.log +0 -0
  25. wandb/debug.log +28 -0
  26. wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml +300 -0
  27. wandb/run-20240424_154339-mwp0iutr/files/config.yaml +663 -0
  28. wandb/run-20240424_154339-mwp0iutr/files/output.log +131 -0
  29. wandb/run-20240424_154339-mwp0iutr/files/requirements.txt +223 -0
  30. wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json +558 -0
  31. wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json +1 -0
  32. wandb/run-20240424_154339-mwp0iutr/logs/debug-internal.log +209 -0
  33. wandb/run-20240424_154339-mwp0iutr/logs/debug.log +29 -0
  34. wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb +0 -0
  35. wandb/run-20240424_164324-xfbnm7qo/files/conda-environment.yaml +300 -0
  36. wandb/run-20240424_164324-xfbnm7qo/files/config.yaml +663 -0
  37. wandb/run-20240424_164324-xfbnm7qo/files/output.log +522 -0
  38. wandb/run-20240424_164324-xfbnm7qo/files/requirements.txt +223 -0
  39. wandb/run-20240424_164324-xfbnm7qo/files/wandb-metadata.json +558 -0
  40. wandb/run-20240424_164324-xfbnm7qo/files/wandb-summary.json +1 -0
  41. wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log +0 -0
  42. wandb/run-20240424_164324-xfbnm7qo/logs/debug.log +28 -0
  43. wandb/run-20240424_164324-xfbnm7qo/run-xfbnm7qo.wandb +0 -0
accelerate_config.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ enable_cpu_affinity: false
6
+ gpu_ids: all
7
+ machine_rank: 0
8
+ main_training_function: main
9
+ mixed_precision: bf16
10
+ num_machines: 1
11
+ num_processes: 8
12
+ rdzv_backend: static
13
+ same_network: true
14
+ tpu_env: []
15
+ tpu_use_cluster: false
16
+ tpu_use_sudo: false
17
+ use_cpu: false
18
+
alignment/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __version__ = "0.3.0.dev0"
2
+
3
+ from .configs import DataArguments, DPOConfig, H4ArgumentParser, ModelArguments, SFTConfig
4
+ from .data import apply_chat_template, get_datasets
5
+ from .model_utils import (
6
+ get_checkpoint,
7
+ get_kbit_device_map,
8
+ get_peft_config,
9
+ get_quantization_config,
10
+ get_tokenizer,
11
+ is_adapter_model,
12
+ )
alignment/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (752 Bytes). View file
 
alignment/__pycache__/configs.cpython-311.pyc ADDED
Binary file (14.1 kB). View file
 
alignment/__pycache__/data.cpython-311.pyc ADDED
Binary file (9.06 kB). View file
 
alignment/__pycache__/model_utils.cpython-311.pyc ADDED
Binary file (5.05 kB). View file
 
alignment/configs.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import dataclasses
16
+ import os
17
+ import sys
18
+ from dataclasses import dataclass, field
19
+ from typing import Any, Dict, List, NewType, Optional, Tuple
20
+
21
+ import transformers
22
+ from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, HfArgumentParser
23
+
24
+
25
# Config classes registered in the causal-LM auto-mapping, and their `model_type` strings.
MODEL_CONFIG_CLASSES = [*MODEL_FOR_CAUSAL_LM_MAPPING.keys()]
MODEL_TYPES = tuple(config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES)


# Annotation-only alias used for "some parsed dataclass instance".
DataClassType = NewType("DataClassType", Any)
30
+
31
+
32
class H4ArgumentParser(HfArgumentParser):
    """`HfArgumentParser` that can combine a YAML config file with `--key=value` command line overrides."""

    @staticmethod
    def _unwrap_optional(field_type):
        """Return `T` for `Optional[T]` / `T | None`; any other annotation is returned unchanged."""
        # Local imports: the module-level `typing` import does not expose these names.
        import types
        from typing import Union, get_args, get_origin

        union_kinds = (Union, getattr(types, "UnionType", Union))
        if get_origin(field_type) in union_kinds:
            non_none = [t for t in get_args(field_type) if t is not type(None)]
            if len(non_none) == 1:
                return non_none[0]
        return field_type

    def parse_yaml_and_args(self, yaml_arg: str, other_args: Optional[List[str]] = None) -> List[dataclass]:
        """
        Parse a YAML file and overwrite the default/loaded values with the values provided to the command line.

        Args:
            yaml_arg (`str`):
                The path to the config file used
            other_args (`List[str]`, *optional*):
                A list of strings to parse as command line arguments, e.g. ['--arg=val', '--arg2=val2'].
                `None` is treated as "no overrides".

        Returns:
            [`List[dataclass]`]: a list of dataclasses with the values from the YAML file and the command line

        Raises:
            ValueError: if an override is not of the form `--arg=val`, or if the same override matches a
                field in more than one dataclass.
        """
        arg_list = self.parse_yaml_file(os.path.abspath(yaml_arg))

        # Turn the override list into a dict of key-value pairs. Only the first "=" separates
        # key from value, so values that themselves contain "=" are preserved intact.
        overrides = {}
        for raw_arg in other_args or []:
            key, separator, value = raw_arg.partition("=")
            if not separator:
                raise ValueError(f"Expected a command line override of the form `--arg=val`, got: {raw_arg}")
            overrides[key.strip("-")] = value

        outputs = []
        used_args = {}

        # overwrite the default/loaded value with the value provided to the command line
        # adapted from https://github.com/huggingface/transformers/blob/d0b5002378daabf62769159add3e7d66d3f83c3b/src/transformers/hf_argparser.py#L327
        for data_yaml, data_class in zip(arg_list, self.dataclass_types):
            keys = {f.name for f in dataclasses.fields(data_yaml) if f.init}
            inputs = {k: v for k, v in vars(data_yaml).items() if k in keys}
            for arg, val in overrides.items():
                # only fields of this dataclass can be overridden here
                if arg not in keys:
                    continue

                # `Optional[int]` etc. must be cast exactly like the bare type
                base_type = self._unwrap_optional(data_yaml.__dataclass_fields__[arg].type)

                if base_type in [int, float]:
                    inputs[arg] = base_type(val)
                elif base_type == List[str]:
                    inputs[arg] = [str(v) for v in val.split(",")]
                elif base_type == bool:
                    # bool of a non-empty string is True, so we manually check for bools
                    inputs[arg] = val in ["true", "True"]
                else:
                    # everything else defaults to the raw string
                    inputs[arg] = val

                # the same override landing in two dataclasses is ambiguous
                if arg in used_args:
                    raise ValueError(f"Duplicate argument provided: {arg}, may cause unexpected behavior")
                used_args[arg] = val

            outputs.append(data_class(**inputs))

        return outputs

    def parse(self) -> DataClassType | Tuple[DataClassType]:
        """
        Parse `sys.argv` into the configured dataclasses.

        A single `.yaml` argument is parsed as a config file; a `.yaml` argument followed by further
        arguments is parsed as a config file plus overrides; anything else is parsed as plain command
        line arguments. A single resulting dataclass is returned unwrapped.
        """
        if len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"):
            # If we pass only one argument to the script and it's the path to a YAML file,
            # let's parse it to get our arguments.
            output = self.parse_yaml_file(os.path.abspath(sys.argv[1]))
        # parse command line args and yaml file
        elif len(sys.argv) > 2 and sys.argv[1].endswith(".yaml"):
            output = self.parse_yaml_and_args(os.path.abspath(sys.argv[1]), sys.argv[2:])
        # parse command line args only
        else:
            output = self.parse_args_into_dataclasses()

        if len(output) == 1:
            output = output[0]
        return output
104
+
105
+
106
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune.

    Raises:
        ValueError: from `__post_init__` if both `load_in_8bit` and `load_in_4bit` are set.
    """

    base_model_revision: Optional[str] = field(
        default=None,
        metadata={"help": ("The base model checkpoint for weights initialization with PEFT adapters.")},
    )
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
            )
        },
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    # Defaults to None, so the annotation must be Optional.
    model_code_revision: Optional[str] = field(default=None, metadata={"help": "The branch of the IFT model"})
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )
    trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code when loading a model."})
    use_flash_attention_2: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to use flash attention 2. You must install this manually by running `pip install flash-attn --no-build-isolation`"
            )
        },
    )
    use_peft: bool = field(
        default=False,
        metadata={"help": ("Whether to use PEFT or not for training.")},
    )
    lora_r: Optional[int] = field(
        default=16,
        metadata={"help": ("LoRA R value.")},
    )
    lora_alpha: Optional[int] = field(
        default=32,
        metadata={"help": ("LoRA alpha.")},
    )
    lora_dropout: Optional[float] = field(
        default=0.05,
        metadata={"help": ("LoRA dropout.")},
    )
    lora_target_modules: Optional[List[str]] = field(
        default=None,
        metadata={"help": ("LoRA target modules.")},
    )
    lora_modules_to_save: Optional[List[str]] = field(
        default=None,
        metadata={"help": ("Model layers to unfreeze & train")},
    )
    load_in_8bit: bool = field(default=False, metadata={"help": "use 8 bit precision"})
    load_in_4bit: bool = field(default=False, metadata={"help": "use 4 bit precision"})

    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4", metadata={"help": "precise the quantization type (fp4 or nf4)"}
    )
    use_bnb_nested_quant: bool = field(default=False, metadata={"help": "use nested quantization"})

    def __post_init__(self):
        # The two quantization modes are mutually exclusive.
        if self.load_in_8bit and self.load_in_4bit:
            raise ValueError("You can't use 8 bit and 4 bit precision at the same time")
183
+
184
+
185
@dataclass
class DataArguments:
    """
    Data-side options: chat template, dataset mixture and splits, preprocessing workers,
    and tokenizer truncation side.
    """

    chat_template: Optional[str] = field(
        default=None,
        metadata=dict(help="The chat template to use."),
    )
    dataset_mixer: Optional[Dict[str, float]] = field(
        default=None,
        metadata=dict(help="Datasets and their proportions to be used for training ift/rl."),
    )
    # A factory is used so each instance gets its own list.
    dataset_splits: Optional[List[str]] = field(
        default_factory=lambda: list(("train", "test")),
        metadata=dict(help="List of train test splits to use in the dataset"),
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata=dict(help="The number of processes to use for the preprocessing."),
    )
    truncation_side: Optional[str] = field(
        default=None,
        metadata=dict(help="Truncation side to use for the tokenizer."),
    )
207
+
208
+
209
@dataclass
class SFTConfig(transformers.TrainingArguments):
    """
    `transformers.TrainingArguments` extended with the extra fields declared below.
    For the inherited parameters, see: https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/trainer#transformers.TrainingArguments
    """

    max_seq_length: Optional[int] = field(
        default=None,
        metadata=dict(help="Used by TRL for reward model training, which tries to read this parameter in init."),
    )
    # Overrides the inherited field so the first step is logged by default.
    logging_first_step: bool = field(
        default=True,
        metadata=dict(help="Whether to log and evaluate the first global_step or not."),
    )
    optim: Optional[str] = field(default="adamw_torch")
224
+
225
+
226
@dataclass
class DPOConfig(transformers.TrainingArguments):
    """
    `transformers.TrainingArguments` extended with the DPO-specific fields declared below.
    For the inherited parameters, see: https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/trainer#transformers.TrainingArguments
    """

    beta: Optional[float] = field(
        default=0.1,
        metadata=dict(help="The beta factor in DPO loss. Higher beta means less divergence from the initial policy."),
    )
    hub_model_revision: Optional[str] = field(
        default="main",
        metadata=dict(help="The Hub model branch to push the model to."),
    )
    logging_first_step: bool = field(
        default=True,
        metadata=dict(help="Whether to log and evaluate the first global_step or not."),
    )
    max_prompt_length: Optional[int] = field(
        default=None,
        metadata=dict(help="For DPO, the maximum length of the prompt to use for conditioning the model."),
    )
    max_length: Optional[int] = field(
        default=None,
        metadata=dict(help="Used by TRL for reward model training, which tries to read this parameter in init."),
    )
    optim: Optional[str] = field(default="rmsprop")
    remove_unused_columns: bool = field(default=False)
    loss_type: Optional[str] = field(default="sigmoid", metadata=dict(help="The loss type for DPO."))
alignment/data.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import os
16
+ from typing import List, Literal, Optional
17
+
18
+ from datasets import DatasetDict, concatenate_datasets, load_dataset, load_from_disk
19
+ from datasets.builder import DatasetGenerationError
20
+
21
+ from .configs import DataArguments
22
+
23
+
24
# Fallback Jinja chat template: each message is rendered as `<|role|>\n` + content + eos_token,
# and `<|assistant|>` is appended after the last message when a generation prompt is requested.
DEFAULT_CHAT_TEMPLATE = (
    "{% for message in messages %}\n"
    "{% if message['role'] == 'user' %}\n"
    "{{ '<|user|>\n' + message['content'] + eos_token }}\n"
    "{% elif message['role'] == 'system' %}\n"
    "{{ '<|system|>\n' + message['content'] + eos_token }}\n"
    "{% elif message['role'] == 'assistant' %}\n"
    "{{ '<|assistant|>\n' + message['content'] + eos_token }}\n"
    "{% endif %}\n"
    "{% if loop.last and add_generation_prompt %}\n"
    "{{ '<|assistant|>' }}\n"
    "{% endif %}\n"
    "{% endfor %}"
)
25
+
26
+
27
def maybe_insert_system_message(messages, tokenizer):
    """
    Prepend an empty system message to `messages` (in place) when the chat template mentions one.

    Nothing happens if the conversation already starts with a system message, if the tokenizer
    exposes no chat template at all, or if the template text does not contain "system".

    Args:
        messages: list of `{"role": ..., "content": ...}` dicts; modified in place.
        tokenizer: object with a `chat_template` attribute and, optionally, a
            `default_chat_template` attribute.
    """
    # An empty conversation has no system message either, so it falls through to the insert logic.
    if messages and messages[0]["role"] == "system":
        return

    # chat template can be one of two attributes, we check in order
    chat_template = tokenizer.chat_template
    if chat_template is None:
        # Not every tokenizer exposes `default_chat_template`, so look it up defensively.
        chat_template = getattr(tokenizer, "default_chat_template", None)

    # confirm the jinja template refers to a system message before inserting
    if chat_template is not None and "system" in chat_template:
        messages.insert(0, {"role": "system", "content": ""})
39
+
40
+
41
def apply_chat_template(
    example,
    tokenizer,
    task: Literal["sft", "generation", "rm", "dpo"],
):
    """
    Render the conversation(s) in `example` to text with the tokenizer's chat template.

    Keys written, by task:
        - "sft" / "generation": `text` (a generation prompt is added only for "generation")
        - "rm": `text_chosen`, `text_rejected` (full dialogues)
        - "dpo": `text_prompt` (all turns but the last), `text_chosen`, `text_rejected` (last turn only)

    Returns the same `example` mapping. Raises `ValueError` for an unknown task, or when an
    "rm"/"dpo" example lacks the `chosen`/`rejected` keys.
    """
    if task in ["sft", "generation"]:
        conversation = example["messages"]
        # We add an empty system message if there is none
        maybe_insert_system_message(conversation, tokenizer)
        wants_generation_prompt = task == "generation"
        example["text"] = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=wants_generation_prompt
        )
        return example

    if task == "rm":
        if not all(k in example.keys() for k in ("chosen", "rejected")):
            raise ValueError(
                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
        chosen_dialogue = example["chosen"]
        rejected_dialogue = example["rejected"]
        # We add an empty system message if there is none
        maybe_insert_system_message(chosen_dialogue, tokenizer)
        maybe_insert_system_message(rejected_dialogue, tokenizer)

        example["text_chosen"] = tokenizer.apply_chat_template(chosen_dialogue, tokenize=False)
        example["text_rejected"] = tokenizer.apply_chat_template(rejected_dialogue, tokenize=False)
        return example

    if task == "dpo":
        if not all(k in example.keys() for k in ("chosen", "rejected")):
            raise ValueError(
                f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
        # For DPO, the inputs are triples of (prompt, chosen, rejected), where `chosen` and `rejected` are the final turn of a dialogue
        # We therefore need to extract the N-1 turns to form the prompt
        chosen_dialogue = example["chosen"]
        prompt_turns = chosen_dialogue[:-1]
        # Prepend a system message if the first message is not a system message
        if chosen_dialogue[0]["role"] != "system":
            prompt_turns.insert(0, {"role": "system", "content": ""})
        # Now we extract the final turn to define chosen/rejected responses
        final_chosen = chosen_dialogue[-1:]
        final_rejected = example["rejected"][-1:]
        example["text_chosen"] = tokenizer.apply_chat_template(final_chosen, tokenize=False)
        example["text_rejected"] = tokenizer.apply_chat_template(final_rejected, tokenize=False)
        example["text_prompt"] = tokenizer.apply_chat_template(prompt_turns, tokenize=False)
        return example

    raise ValueError(
        f"Task {task} not supported, please ensure that the provided task is one of {['sft', 'generation', 'rm', 'dpo']}"
    )
90
+
91
+
92
def get_datasets(
    data_config: DataArguments | dict,
    splits: Optional[List[str]] = None,
    shuffle: bool = True,
) -> DatasetDict:
    """
    Loads one or more datasets with varying training set proportions.

    Args:
        data_config (`DataArguments` or `dict`):
            Dataset configuration and split proportions.
        splits (`List[str]`, *optional*, defaults to `['train', 'test']`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets. The first
            split is used as training data; any further splits are used as test data.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns:
        [`DatasetDict`]: The dataset dictionary containing the loaded datasets.

    Raises:
        ValueError: if `data_config` is neither a `DataArguments` nor a `dict`.
    """
    # Resolve the default here instead of using a mutable list as the default argument.
    if splits is None:
        splits = ["train", "test"]

    if isinstance(data_config, DataArguments):
        # Structure of the config to read the datasets and their mix
        # datasets_mixer:
        #     - 'dataset1': 0.5
        #     - 'dataset2': 0.3
        #     - 'dataset3': 0.2
        dataset_mixer = data_config.dataset_mixer
    elif isinstance(data_config, dict):
        # Structure of the input is:
        #     dataset_mixer = {
        #             "dataset1": 0.5,
        #             "dataset2": 0.3,
        #             "dataset3": 0.2,
        #         }
        dataset_mixer = data_config
    else:
        raise ValueError(f"Data config {data_config} not recognized.")

    return mix_datasets(dataset_mixer, splits=splits, shuffle=shuffle)
132
+
133
+
134
def mix_datasets(dataset_mixer: dict, splits: Optional[List[str]] = None, shuffle=True) -> DatasetDict:
    """
    Loads and mixes datasets according to proportions specified in `dataset_mixer`.

    Args:
        dataset_mixer (`dict`):
            Dictionary containing the dataset names and their training proportions. By default, all test proportions are 1.
        splits (Optional[List[str]], *optional*, defaults to `None`):
            Dataset splits to load and mix; `None` means `['train', 'test']`. Assumes the splits exist
            in all datasets. The first split of each dataset is treated as training data and
            subsampled by its proportion; every further split is treated as test data.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns:
        [`DatasetDict`]: with a `train` and/or `test` entry.

    Raises:
        ValueError: if any proportion is negative, or if nothing was loaded.
    """
    if splits is None:
        splits = ["train", "test"]

    # Validate the proportions before loading anything.
    fracs = list(dataset_mixer.values())
    if any(frac < 0 for frac in fracs):
        raise ValueError("Dataset fractions cannot be negative.")

    raw_train_datasets = []
    raw_val_datasets = []
    for ds in dataset_mixer:
        for idx, split in enumerate(splits):
            try:
                # Try first if dataset on a Hub repo
                dataset = load_dataset(ds, split=split)
            except DatasetGenerationError:
                # If not, check local dataset
                dataset = load_from_disk(os.path.join(ds, split))

            # The role of a split is given by its position, not by its name.
            if idx == 0:
                raw_train_datasets.append(dataset)
            else:
                raw_val_datasets.append(dataset)

    if not raw_train_datasets and not raw_val_datasets:
        raise ValueError(
            f"Dataset {dataset_mixer} not recognized with splits {splits}. Check the dataset has been correctly formatted."
        )

    raw_datasets = DatasetDict()

    if raw_train_datasets:
        train_subsets = [
            dataset.select(range(int(frac * len(dataset)))) for dataset, frac in zip(raw_train_datasets, fracs)
        ]
        train_data = concatenate_datasets(train_subsets)
        raw_datasets["train"] = train_data.shuffle(seed=42) if shuffle else train_data

    # No subsampling for test datasets to enable fair comparison across models
    if raw_val_datasets:
        test_data = concatenate_datasets(raw_val_datasets)
        raw_datasets["test"] = test_data.shuffle(seed=42) if shuffle else test_data

    return raw_datasets
190
+
alignment/model_utils.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import os
16
+ from pathlib import Path
17
+ from typing import Dict
18
+
19
+ import torch
20
+ from transformers import AutoTokenizer, BitsAndBytesConfig, PreTrainedTokenizer
21
+ from transformers.trainer_utils import get_last_checkpoint
22
+
23
+ from accelerate import Accelerator
24
+ from huggingface_hub import list_repo_files
25
+ from huggingface_hub.utils._validators import HFValidationError
26
+ from peft import LoraConfig, PeftConfig
27
+
28
+ from .configs import DataArguments, DPOConfig, ModelArguments, SFTConfig
29
+ from .data import DEFAULT_CHAT_TEMPLATE
30
+
31
+
32
+ def get_current_device() -> int:
33
+ """Get the current device. For GPU we return the local process index to enable multiple GPU training."""
34
+ return Accelerator().local_process_index if torch.cuda.is_available() else "cpu"
35
+
36
+
37
+ def get_kbit_device_map() -> Dict[str, int] | None:
38
+ """Useful for running inference with quantized models by setting `device_map=get_peft_device_map()`"""
39
+ return {"": get_current_device()} if torch.cuda.is_available() else None
40
+
41
+
42
def get_quantization_config(model_args: ModelArguments) -> BitsAndBytesConfig | None:
    """
    Build the `BitsAndBytesConfig` requested by `model_args`.

    4-bit takes precedence over 8-bit; `None` is returned when neither is requested. For 4-bit,
    the compute dtype is `torch.float16` unless `torch_dtype` names another torch dtype.
    """
    if model_args.load_in_4bit:
        requested_dtype = model_args.torch_dtype
        if requested_dtype in {"auto", None}:
            compute_dtype = torch.float16
        else:
            compute_dtype = getattr(torch, requested_dtype)

        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
            bnb_4bit_use_double_quant=model_args.use_bnb_nested_quant,
        )

    if model_args.load_in_8bit:
        return BitsAndBytesConfig(
            load_in_8bit=True,
        )

    return None
62
+
63
+
64
def get_tokenizer(model_args: ModelArguments, data_args: DataArguments) -> PreTrainedTokenizer:
    """
    Get the tokenizer for the model.

    Loads the tokenizer for `model_args.model_name_or_path` at `model_args.model_revision`, then:
    falls back to the EOS token for padding, applies `data_args.truncation_side` if given, caps a
    `model_max_length` above 100,000 at 2048, and sets the chat template from `data_args` or, when
    the tokenizer has none, from `DEFAULT_CHAT_TEMPLATE`.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        revision=model_args.model_revision,
    )
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    if data_args.truncation_side is not None:
        tokenizer.truncation_side = data_args.truncation_side

    # Set reasonable default for models without max length
    if tokenizer.model_max_length > 100_000:
        tokenizer.model_max_length = 2048

    if data_args.chat_template is not None:
        tokenizer.chat_template = data_args.chat_template
    # Not every tokenizer exposes `default_chat_template`; treat a missing attribute as "no default".
    elif tokenizer.chat_template is None and getattr(tokenizer, "default_chat_template", None) is None:
        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

    return tokenizer
86
+
87
+
88
def get_peft_config(model_args: ModelArguments) -> PeftConfig | None:
    """Return a causal-LM `LoraConfig` built from `model_args`, or `None` when `use_peft` is `False`."""
    if model_args.use_peft is False:
        return None

    lora_kwargs = {
        "r": model_args.lora_r,
        "lora_alpha": model_args.lora_alpha,
        "lora_dropout": model_args.lora_dropout,
        "bias": "none",
        "task_type": "CAUSAL_LM",
        "target_modules": model_args.lora_target_modules,
        "modules_to_save": model_args.lora_modules_to_save,
    }
    return LoraConfig(**lora_kwargs)
103
+
104
+
105
def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool:
    """Tell whether the repo (Hub first, local directory as fallback) contains adapter weights."""
    try:
        # Try first if model on a Hub repo
        available_files = list_repo_files(model_name_or_path, revision=revision)
    except HFValidationError:
        # If not, check local repo
        available_files = os.listdir(model_name_or_path)

    adapter_weight_names = ("adapter_model.safetensors", "adapter_model.bin")
    return any(name in available_files for name in adapter_weight_names)
113
+
114
+
115
def get_checkpoint(training_args: SFTConfig | DPOConfig) -> Path | None:
    """Return the last checkpoint found in `training_args.output_dir`, or `None` if that directory is missing."""
    output_dir = training_args.output_dir
    if not os.path.isdir(output_dir):
        return None
    return get_last_checkpoint(output_dir)
alignment/release.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import argparse
17
+ import re
18
+
19
+ import packaging.version
20
+
21
+
22
# For each target kind: (regex locating the version line, replacement template with a VERSION placeholder).
REPLACE_PATTERNS = dict(
    init=(
        re.compile(r'^__version__\s+=\s+"([^"]+)"\s*$', re.MULTILINE),
        '__version__ = "VERSION"\n',
    ),
    setup=(
        re.compile(r'^(\s*)version\s*=\s*"[^"]+",', re.MULTILINE),
        r'\1version="VERSION",',
    ),
)
# File that each pattern above is applied to.
REPLACE_FILES = dict(
    init="src/alignment/__init__.py",
    setup="setup.py",
)
README_FILE = "README.md"
31
+
32
+
33
def update_version_in_file(fname, version, pattern):
    """Rewrite `fname` in place, replacing the version matched by `REPLACE_PATTERNS[pattern]` with `version`."""
    with open(fname, "r", encoding="utf-8", newline="\n") as source:
        original_text = source.read()

    version_regex, template = REPLACE_PATTERNS[pattern]
    updated_text = version_regex.sub(template.replace("VERSION", version), original_text)

    with open(fname, "w", encoding="utf-8", newline="\n") as target:
        target.write(updated_text)
42
+
43
+
44
def global_version_update(version, patch=False):
    """Write `version` into every file listed in `REPLACE_FILES` (`patch` is accepted but not used)."""
    for pattern_key in REPLACE_FILES:
        update_version_in_file(REPLACE_FILES[pattern_key], version, pattern_key)
48
+
49
+
50
def get_version():
    """
    Reads the current version in the __init__.

    Returns:
        The parsed version (`packaging.version.parse` of the `__version__` string).

    Raises:
        ValueError: if no `__version__ = "..."` line is found in the init file.
    """
    init_file = REPLACE_FILES["init"]
    # Read with the same encoding `update_version_in_file` writes with.
    with open(init_file, "r", encoding="utf-8") as f:
        code = f.read()
    match = REPLACE_PATTERNS["init"][0].search(code)
    if match is None:
        raise ValueError(f"Could not find a `__version__` assignment in {init_file}")
    default_version = match.groups()[0]
    return packaging.version.parse(default_version)
56
+
57
+
58
def pre_release_work(patch=False):
    """
    Do all the necessary pre-release steps.

    Proposes a release version (base version on a dev release, otherwise a micro bump for a patch
    or a minor bump), lets the user confirm or override it, and writes it to the versioned files.
    Raises `ValueError` when a patch is requested from a dev version.
    """
    current = get_version()
    if current.is_devrelease:
        if patch:
            raise ValueError("Can't create a patch version from the dev branch, checkout a released version!")
        default_version = current.base_version
    elif patch:
        default_version = f"{current.major}.{current.minor}.{current.micro + 1}"
    else:
        default_version = f"{current.major}.{current.minor + 1}.0"

    # An empty answer accepts the suggested version.
    answer = input(f"Which version are you releasing? [{default_version}]")
    version = answer if answer else default_version

    print(f"Updating version to {version}.")
    global_version_update(version, patch=patch)
78
+
79
+
80
def post_release_work():
    """
    Do all the necessary post-release steps.

    Proposes the next development version (minor bump with a `.dev0` suffix), lets the user confirm
    or override it, and writes it to the versioned files.
    """
    # First let's get the current version
    current_version = get_version()
    dev_version = f"{current_version.major}.{current_version.minor + 1}.0.dev0"

    # Check with the user we got that right.
    version = input(f"Which version are we developing now? [{dev_version}]")
    if len(version) == 0:
        version = dev_version

    print(f"Updating version to {version}.")
    global_version_update(version)
94
+
95
+
96
if __name__ == "__main__":
    # CLI entry point: pre-release by default, post-release with --post_release.
    cli = argparse.ArgumentParser()
    cli.add_argument("--post_release", action="store_true", help="Whether this is pre or post release.")
    cli.add_argument("--patch", action="store_true", help="Whether or not this is a patch release.")
    args = cli.parse_args()

    if args.post_release and args.patch:
        print("Nothing to do after a patch :-)")
    elif args.post_release:
        post_release_work()
    else:
        pre_release_work(patch=args.patch)
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "sanchit-gandhi/Mistral-7B-v0.1-6-layer",
3
+ "architectures": [
4
+ "MistralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 4096,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 14336,
13
+ "max_position_embeddings": 32768,
14
+ "model_type": "mistral",
15
+ "num_attention_heads": 32,
16
+ "num_hidden_layers": 6,
17
+ "num_key_value_heads": 8,
18
+ "rms_norm_eps": 1e-05,
19
+ "rope_theta": 10000.0,
20
+ "sliding_window": 4096,
21
+ "tie_word_embeddings": false,
22
+ "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.40.0.dev0",
24
+ "use_cache": false,
25
+ "vocab_size": 32000
26
+ }
config_full.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model arguments
2
+ model_name_or_path: sanchit-gandhi/Mistral-7B-v0.1-6-layer
3
+ model_revision: main
4
+ torch_dtype: bfloat16
5
+ use_flash_attention_2: false # torch sdpa sufficient
6
+
7
+ # Data training arguments
8
+ chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
9
+ dataset_mixer:
10
+ stingning/ultrachat: 1.0
11
+ dataset_splits:
12
+ - train[1000:]
13
+ - train[:1000]
14
+ preprocessing_num_workers: 32
15
+
16
+ # SFT trainer config
17
+ bf16: true
18
+ do_eval: true
19
+ evaluation_strategy: steps
20
+ eval_steps: 5000
21
+ save_strategy: "steps"
22
+ save_total_limit: 5000
23
+ gradient_accumulation_steps: 1
24
+ gradient_checkpointing: true
25
+ gradient_checkpointing_kwargs:
26
+ use_reentrant: False
27
+ hub_strategy: every_save
28
+ learning_rate: 0.0001
29
+ log_level: info
30
+ logging_steps: 25
31
+ logging_strategy: steps
32
+ max_seq_length: 2048
33
+ max_steps: 20000
34
+ output_dir: ./
35
+ overwrite_output_dir: true
36
+ per_device_eval_batch_size: 32
37
+ per_device_train_batch_size: 32
38
+ push_to_hub: true
39
+ remove_unused_columns: true
40
+ report_to:
41
+ - tensorboard
42
+ - wandb
43
+ seed: 42
44
+ warmup_steps: 500
45
+ ddp_timeout: 7200
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7f84b30ad1e26b72493f2e487a84b8fb077327a611d56fcd0605d78146fa822
3
+ size 3141646744
run_sft.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Supervised fine-tuning script for decoder language models.
18
+ """
19
+
20
+ import logging
21
+ import random
22
+ import sys
23
+
24
+ import datasets
25
+ import torch
26
+ import transformers
27
+ from transformers import set_seed
28
+
29
+ from alignment import (
30
+ DataArguments,
31
+ H4ArgumentParser,
32
+ ModelArguments,
33
+ SFTConfig,
34
+ apply_chat_template,
35
+ get_checkpoint,
36
+ get_datasets,
37
+ get_kbit_device_map,
38
+ get_peft_config,
39
+ get_quantization_config,
40
+ get_tokenizer,
41
+ )
42
+ from trl import SFTTrainer
43
+
44
+
45
+ logger = logging.getLogger(__name__)
46
+
47
+
48
def _format_messages(example):
    """Turn the raw list of turns in ``example["data"]`` into chat ``messages``.

    Roles are assigned purely by position: even indices become ``"user"`` turns and
    odd indices become ``"assistant"`` turns.
    """
    example["messages"] = [
        {"content": message, "role": "user" if idx % 2 == 0 else "assistant"}
        for idx, message in enumerate(example["data"])
    ]
    return example


def main():
    """Run supervised fine-tuning end to end.

    Steps, in order: parse the model/data/training arguments, configure logging,
    detect a resumable checkpoint, load the datasets and bring them into the
    ``messages`` format, apply the chat template, build the model init kwargs,
    train with ``SFTTrainer``, optionally evaluate, then save the model, write the
    model card and optionally push to the Hub.
    """
    parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig))
    model_args, data_args, training_args = parser.parse()

    # Set seed for reproducibility
    set_seed(training_args.seed)

    ###############
    # Setup logging
    ###############
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process a small summary. Both fp16 and bf16 count as 16-bit training;
    # reporting fp16 alone prints False for bf16 runs.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f" distributed training: {bool(training_args.local_rank != -1)},"
        + f" 16-bits training: {training_args.fp16 or training_args.bf16}"
    )
    logger.info(f"Model parameters {model_args}")
    logger.info(f"Data parameters {data_args}")
    logger.info(f"Training/evaluation parameters {training_args}")

    # Check for last checkpoint
    last_checkpoint = get_checkpoint(training_args)
    if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
        logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.")

    ###############
    # Load datasets
    ###############
    raw_datasets = get_datasets(data_args, splits=data_args.dataset_splits)
    logger.info(
        f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
    )
    column_names = list(raw_datasets["train"].features)
    if "messages" not in column_names:
        # The dataset only has a flat list of turns: build the `messages` column first.
        with training_args.main_process_first(desc="Formatting messages"):
            raw_datasets = raw_datasets.map(
                _format_messages,
                desc="Formatting messages",
                num_proc=data_args.preprocessing_num_workers,
            )
        # Refresh the column list so the freshly added `messages` column is also dropped
        # once the chat template has produced the final `text` column.
        column_names = list(raw_datasets["train"].features)

    ################
    # Load tokenizer
    ################
    tokenizer = get_tokenizer(model_args, data_args)

    #####################
    # Apply chat template
    #####################
    with training_args.main_process_first():
        raw_datasets = raw_datasets.map(
            apply_chat_template,
            fn_kwargs={"tokenizer": tokenizer, "task": "sft"},
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            desc="Applying chat template",
        )
    train_dataset = raw_datasets["train"]
    eval_dataset = raw_datasets["test"]

    with training_args.main_process_first(desc="Log a few random samples from the processed training set"):
        # random.sample raises ValueError when asked for more items than available,
        # so never request more samples than the training split holds.
        num_samples = min(3, len(train_dataset))
        for index in random.sample(range(len(train_dataset)), num_samples):
            logger.info(f"Sample {index} of the processed training set:\n\n{train_dataset[index]['text']}")

    #######################
    # Load pretrained model
    #######################
    logger.info("*** Load pretrained model ***")
    # "auto" and None are passed through untouched; any other value names a torch dtype.
    torch_dtype = (
        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
    )
    quantization_config = get_quantization_config(model_args)

    model_kwargs = dict(
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
        use_flash_attention_2=model_args.use_flash_attention_2,
        torch_dtype=torch_dtype,
        # The k,v cache is switched off whenever gradient checkpointing is enabled.
        use_cache=not training_args.gradient_checkpointing,
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
    )
    # Only the init kwargs exist at this point: the model is handed to SFTTrainer by name
    # below, so it has not been instantiated yet.
    logger.info("*** Model init kwargs prepared ***")

    ########################
    # Initialize the Trainer
    ########################
    trainer = SFTTrainer(
        model=model_args.model_name_or_path,
        model_init_kwargs=model_kwargs,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="text",
        max_seq_length=training_args.max_seq_length,
        tokenizer=tokenizer,
        packing=True,
        peft_config=get_peft_config(model_args),
    )

    ###############
    # Training loop
    ###############
    logger.info("*** Train ***")
    # An explicitly requested checkpoint wins over an auto-detected one.
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    metrics["train_samples"] = len(train_dataset)
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    ##########
    # Evaluate
    ##########
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        metrics["eval_samples"] = len(eval_dataset)
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    ##################################
    # Save model and create model card
    ##################################
    logger.info("*** Save model ***")
    trainer.save_model(training_args.output_dir)
    logger.info(f"Model saved to {training_args.output_dir}")

    # Save everything else on main process
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "dataset": list(data_args.dataset_mixer.keys()),
        "dataset_tags": list(data_args.dataset_mixer.keys()),
        "tags": ["alignment-handbook"],
    }
    if trainer.accelerator.is_main_process:
        trainer.create_model_card(**kwargs)
        # Restore k,v cache for fast inference
        trainer.model.config.use_cache = True
        trainer.model.config.save_pretrained(training_args.output_dir)

    if training_args.push_to_hub:
        logger.info("Pushing to hub...")
        trainer.push_to_hub(**kwargs)

    logger.info("*** Training complete ***")
214
+
215
+
216
+ if __name__ == "__main__":
217
+ main()
218
+
runs/Apr24_14-23-38_ip-26-0-162-233/events.out.tfevents.1713973415.ip-26-0-162-233.1840687.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:713e8ed73c7d50dde946e1af7c24c1babc165667a442d5f8e8f3674cf32ae072
3
+ size 4886
runs/Apr24_16-42-31_ip-26-0-162-233/events.out.tfevents.1713977002.ip-26-0-162-233.1854033.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d99d51bb7fcd506f76273107b7f54b0a07695a2cf620317840bf9823aa458c38
3
+ size 9086
slurm_job.slurm ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=distil-zephyr
#SBATCH --nodes=1
# set 24h for job wall time limit
#SBATCH --time=24:00:00
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:8
#SBATCH --exclusive
#SBATCH --partition=hopper-prod
#SBATCH --output=/fsx/sanchit/alignment-logs/%x-%j.out

# Trace every command and abort on the first failing one.
# NOTE: `pipefail` is deliberately NOT enabled globally: `shuf | head` in unused_port
# ends with SIGPIPE on `shuf`, which would abort the job under `set -e -o pipefail`.
set -x -e

# START EDIT
source ~/.bashrc
source /fsx/sanchit/miniconda3/bin/activate alignment

# Aggregated log that the training output is appended to (in addition to --output above).
LOG_PATH="/fsx/sanchit/alignment-logs/main_log.txt"
# Not referenced below; kept so that site-specific settings stay in one place.
SAVE_DIR="/fsx/sanchit"
# END EDIT

echo "START TIME: $(date)"

GPUS_PER_NODE=8
NNODES=$SLURM_NNODES

# so processes know who to talk to
MASTER_ADDR=`scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1`

# From https://i.hsfzxjy.site/2021-03-10-obtain-a-random-unused-tcp-port-with-bash/
# Print N (default 1) random TCP ports in 1025-65535 that `ss` does not list as in use.
function unused_port() {
    N=${1:-1}
    comm -23 \
        <(seq "1025" "65535" | sort) \
        <(ss -Htan |
            awk '{print $4}' |
            cut -d':' -f2 |
            sort -u) |
        shuf |
        head -n "$N"
}
MASTER_PORT=$(unused_port)

# export TORCH_CPP_LOG_LEVEL=INFO
# export TORCH_DISTRIBUTED_DEBUG=DETAIL

export LAUNCHER="python -u -m accelerate.commands.launch --config_file ./accelerate_config.yaml"

export PROGRAM="./run_sft.py ./config_full.yaml"
export CMD="$LAUNCHER $PROGRAM"
echo $CMD

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    "

# Make sure the log directory exists before tee tries to append to it.
mkdir -p "$(dirname "$LOG_PATH")"

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
# $SRUN_ARGS is intentionally unquoted so that it splits into separate options.
srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$CMD" 2>&1 | tee -a "$LOG_PATH"
# The pipeline's own status is tee's; keep srun's status so a failed run fails the job.
EXIT_CODE=${PIPESTATUS[0]}

echo "END TIME: $(date)"
exit $EXIT_CODE
76
+
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1464ab5521091ef44c1647b6866ecc70515e4a2469ed5b7ed407275c3c551c0d
3
+ size 4984
wandb/debug-cli.sanchit.log ADDED
File without changes
wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-04-24 16:43:24,533 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
2
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Configure stats pid to 1854033
3
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
4
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/settings
5
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py'}
8
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug.log
9
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log
10
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():564] calling init triggers
11
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
12
+ config: {}
13
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():614] starting backend
14
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():618] setting up manager
15
+ 2024-04-24 16:43:24,537 INFO MainThread:1854033 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-04-24 16:43:24,541 INFO MainThread:1854033 [wandb_init.py:init():624] backend started and connected
17
+ 2024-04-24 16:43:24,544 INFO MainThread:1854033 [wandb_init.py:init():716] updated telemetry
18
+ 2024-04-24 16:43:24,569 INFO MainThread:1854033 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
19
+ 2024-04-24 16:43:24,850 INFO MainThread:1854033 [wandb_run.py:_on_init():2254] communicating current version
20
+ 2024-04-24 16:43:24,896 INFO MainThread:1854033 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-04-24 16:43:24,896 INFO MainThread:1854033 [wandb_init.py:init():800] starting run threads in backend
23
+ 2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_console_start():2233] atexit reg
24
+ 2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2088] redirect: wrap_raw
25
+ 2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2153] Wrapping output streams.
26
+ 2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2178] Redirects installed.
27
+ 2024-04-24 16:43:30,533 INFO MainThread:1854033 [wandb_init.py:init():841] run started, returning control to user process
28
+ 2024-04-24 16:43:30,535 INFO MainThread:1854033 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.40.0.dev0', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 20000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr24_16-42-31_ip-26-0-162-233', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 5000, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 5000, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': 
False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: venv
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - defaults
6
+ dependencies:
7
+ - _libgcc_mutex=0.1=main
8
+ - _openmp_mutex=5.1=1_gnu
9
+ - blas=1.0=mkl
10
+ - brotli-python=1.0.9=py311h6a678d5_7
11
+ - bzip2=1.0.8=h7b6447c_0
12
+ - ca-certificates=2023.12.12=h06a4308_0
13
+ - certifi=2023.11.17=py311h06a4308_0
14
+ - cffi=1.16.0=py311h5eee18b_0
15
+ - cryptography=41.0.7=py311hdda0065_0
16
+ - cuda-cudart=12.1.105=0
17
+ - cuda-cupti=12.1.105=0
18
+ - cuda-libraries=12.1.0=0
19
+ - cuda-nvrtc=12.1.105=0
20
+ - cuda-nvtx=12.1.105=0
21
+ - cuda-opencl=12.3.101=0
22
+ - cuda-runtime=12.1.0=0
23
+ - ffmpeg=4.3=hf484d3e_0
24
+ - filelock=3.13.1=py311h06a4308_0
25
+ - freetype=2.12.1=h4a9f257_0
26
+ - giflib=5.2.1=h5eee18b_3
27
+ - gmp=6.2.1=h295c915_3
28
+ - gmpy2=2.1.2=py311hc9b5ff0_0
29
+ - gnutls=3.6.15=he1e5248_0
30
+ - intel-openmp=2023.1.0=hdb19cb5_46306
31
+ - jinja2=3.1.2=py311h06a4308_0
32
+ - jpeg=9e=h5eee18b_1
33
+ - lame=3.100=h7b6447c_0
34
+ - lcms2=2.12=h3be6417_0
35
+ - ld_impl_linux-64=2.38=h1181459_1
36
+ - lerc=3.0=h295c915_0
37
+ - libcublas=12.1.0.26=0
38
+ - libcufft=11.0.2.4=0
39
+ - libcufile=1.8.1.2=0
40
+ - libcurand=10.3.4.101=0
41
+ - libcusolver=11.4.4.55=0
42
+ - libcusparse=12.0.2.55=0
43
+ - libdeflate=1.17=h5eee18b_1
44
+ - libffi=3.4.4=h6a678d5_0
45
+ - libgcc-ng=11.2.0=h1234567_1
46
+ - libgomp=11.2.0=h1234567_1
47
+ - libiconv=1.16=h7f8727e_2
48
+ - libidn2=2.3.4=h5eee18b_0
49
+ - libjpeg-turbo=2.0.0=h9bf148f_0
50
+ - libnpp=12.0.2.50=0
51
+ - libnvjitlink=12.1.105=0
52
+ - libnvjpeg=12.1.1.14=0
53
+ - libpng=1.6.39=h5eee18b_0
54
+ - libstdcxx-ng=11.2.0=h1234567_1
55
+ - libtasn1=4.19.0=h5eee18b_0
56
+ - libtiff=4.5.1=h6a678d5_0
57
+ - libunistring=0.9.10=h27cfd23_0
58
+ - libuuid=1.41.5=h5eee18b_0
59
+ - libwebp=1.3.2=h11a3e52_0
60
+ - libwebp-base=1.3.2=h5eee18b_0
61
+ - llvm-openmp=14.0.6=h9e868ea_0
62
+ - lz4-c=1.9.4=h6a678d5_0
63
+ - markupsafe=2.1.1=py311h5eee18b_0
64
+ - mkl=2023.1.0=h213fc3f_46344
65
+ - mkl-service=2.4.0=py311h5eee18b_1
66
+ - mkl_fft=1.3.8=py311h5eee18b_0
67
+ - mkl_random=1.2.4=py311hdb19cb5_0
68
+ - mpc=1.1.0=h10f8cd9_1
69
+ - mpfr=4.0.2=hb69a4c5_1
70
+ - mpmath=1.3.0=py311h06a4308_0
71
+ - ncurses=6.4=h6a678d5_0
72
+ - nettle=3.7.3=hbbd107a_1
73
+ - networkx=3.1=py311h06a4308_0
74
+ - numpy=1.26.2=py311h08b1b3b_0
75
+ - numpy-base=1.26.2=py311hf175353_0
76
+ - openh264=2.1.1=h4ff587b_0
77
+ - openjpeg=2.4.0=h3ad879b_0
78
+ - openssl=3.0.12=h7f8727e_0
79
+ - pycparser=2.21=pyhd3eb1b0_0
80
+ - pyopenssl=23.2.0=py311h06a4308_0
81
+ - pysocks=1.7.1=py311h06a4308_0
82
+ - python=3.11.5=h955ad1f_0
83
+ - pytorch-cuda=12.1=ha16c6d3_5
84
+ - pytorch-mutex=1.0=cuda
85
+ - pyyaml=6.0.1=py311h5eee18b_0
86
+ - readline=8.2=h5eee18b_0
87
+ - requests=2.31.0=py311h06a4308_0
88
+ - setuptools=68.2.2=py311h06a4308_0
89
+ - sqlite=3.41.2=h5eee18b_0
90
+ - sympy=1.12=py311h06a4308_0
91
+ - tbb=2021.8.0=hdb19cb5_0
92
+ - tk=8.6.12=h1ccaba5_0
93
+ - wheel=0.41.2=py311h06a4308_0
94
+ - xz=5.4.5=h5eee18b_0
95
+ - yaml=0.2.5=h7b6447c_0
96
+ - zlib=1.2.13=h5eee18b_0
97
+ - zstd=1.5.5=hc292b87_0
98
+ - pip:
99
+ - absl-py==2.0.0
100
+ - accelerate==0.29.3
101
+ - aiohttp==3.9.1
102
+ - aiosignal==1.3.1
103
+ - annotated-types==0.6.0
104
+ - anyio==4.2.0
105
+ - appdirs==1.4.4
106
+ - argon2-cffi==23.1.0
107
+ - argon2-cffi-bindings==21.2.0
108
+ - arrow==1.3.0
109
+ - asttokens==2.4.1
110
+ - astunparse==1.6.3
111
+ - async-lru==2.0.4
112
+ - attrs==23.1.0
113
+ - audioread==3.0.1
114
+ - babel==2.14.0
115
+ - beautifulsoup4==4.12.3
116
+ - bitsandbytes==0.43.1
117
+ - bleach==6.1.0
118
+ - cachetools==5.3.2
119
+ - chardet==5.2.0
120
+ - charset-normalizer==3.3.2
121
+ - click==8.1.7
122
+ - comm==0.2.1
123
+ - datasets==2.18.1.dev0
124
+ - debugpy==1.8.1
125
+ - decorator==5.1.1
126
+ - deepspeed==0.12.2
127
+ - defusedxml==0.7.1
128
+ - dill==0.3.7
129
+ - docker-pycreds==0.4.0
130
+ - docstring-parser==0.15
131
+ - einops==0.7.0
132
+ - evaluate==0.4.0
133
+ - executing==2.0.1
134
+ - fastjsonschema==2.19.1
135
+ - flatbuffers==23.5.26
136
+ - fqdn==1.5.1
137
+ - frozenlist==1.4.1
138
+ - fsspec==2023.10.0
139
+ - gast==0.5.4
140
+ - gitdb==4.0.11
141
+ - gitpython==3.1.40
142
+ - google-auth==2.26.1
143
+ - google-auth-oauthlib==1.2.0
144
+ - google-pasta==0.2.0
145
+ - grpcio==1.60.0
146
+ - h11==0.14.0
147
+ - h5py==3.10.0
148
+ - hf-transfer==0.1.5
149
+ - hjson==3.1.0
150
+ - httpcore==1.0.2
151
+ - httpx==0.26.0
152
+ - huggingface-hub==0.22.2
153
+ - idna==3.6
154
+ - ipdb==0.13.13
155
+ - ipykernel==6.29.2
156
+ - ipython==8.21.0
157
+ - isoduration==20.11.0
158
+ - jedi==0.19.1
159
+ - jiwer==3.0.3
160
+ - joblib==1.3.2
161
+ - json5==0.9.14
162
+ - jsonpointer==2.4
163
+ - jsonschema==4.21.1
164
+ - jsonschema-specifications==2023.12.1
165
+ - jupyter-client==8.6.0
166
+ - jupyter-core==5.7.1
167
+ - jupyter-events==0.9.0
168
+ - jupyter-lsp==2.2.2
169
+ - jupyter-server==2.12.5
170
+ - jupyter-server-terminals==0.5.2
171
+ - jupyterlab==4.1.1
172
+ - jupyterlab-pygments==0.3.0
173
+ - jupyterlab-server==2.25.2
174
+ - keras==2.15.0
175
+ - lazy-loader==0.3
176
+ - libclang==16.0.6
177
+ - librosa==0.10.1
178
+ - llvmlite==0.41.1
179
+ - markdown==3.5.1
180
+ - markdown-it-py==3.0.0
181
+ - matplotlib-inline==0.1.6
182
+ - mdurl==0.1.2
183
+ - mistune==3.0.2
184
+ - ml-dtypes==0.2.0
185
+ - msgpack==1.0.7
186
+ - multidict==6.0.4
187
+ - multiprocess==0.70.15
188
+ - nbclient==0.9.0
189
+ - nbconvert==7.16.0
190
+ - nbformat==5.9.2
191
+ - nest-asyncio==1.6.0
192
+ - ninja==1.11.1.1
193
+ - nltk==3.8.1
194
+ - notebook-shim==0.2.3
195
+ - numba==0.58.1
196
+ - nvidia-cublas-cu12==12.1.3.1
197
+ - nvidia-cuda-cupti-cu12==12.1.105
198
+ - nvidia-cuda-nvrtc-cu12==12.1.105
199
+ - nvidia-cuda-runtime-cu12==12.1.105
200
+ - nvidia-cudnn-cu12==8.9.2.26
201
+ - nvidia-cufft-cu12==11.0.2.54
202
+ - nvidia-curand-cu12==10.3.2.106
203
+ - nvidia-cusolver-cu12==11.4.5.107
204
+ - nvidia-cusparse-cu12==12.1.0.106
205
+ - nvidia-nccl-cu12==2.20.5
206
+ - nvidia-nvjitlink-cu12==12.3.101
207
+ - nvidia-nvtx-cu12==12.1.105
208
+ - oauthlib==3.2.2
209
+ - opt-einsum==3.3.0
210
+ - overrides==7.7.0
211
+ - packaging==23.2
212
+ - pandas==2.1.4
213
+ - pandocfilters==1.5.1
214
+ - parso==0.8.3
215
+ - peft==0.7.1
216
+ - pexpect==4.9.0
217
+ - pillow==10.2.0
218
+ - pip==24.0
219
+ - platformdirs==4.1.0
220
+ - pooch==1.8.0
221
+ - prometheus-client==0.19.0
222
+ - prompt-toolkit==3.0.43
223
+ - protobuf==3.20.2
224
+ - psutil==5.9.7
225
+ - ptyprocess==0.7.0
226
+ - pure-eval==0.2.2
227
+ - py-cpuinfo==9.0.0
228
+ - pyarrow==14.0.2
229
+ - pyarrow-hotfix==0.6
230
+ - pyasn1==0.5.1
231
+ - pyasn1-modules==0.3.0
232
+ - pydantic==2.6.0
233
+ - pydantic-core==2.16.1
234
+ - pygments==2.17.2
235
+ - pynvml==11.5.0
236
+ - python-dateutil==2.8.2
237
+ - python-json-logger==2.0.7
238
+ - pytorch-triton==3.0.0+989adb9a29
239
+ - pytz==2023.3.post1
240
+ - pyzmq==25.1.2
241
+ - rapidfuzz==3.6.1
242
+ - referencing==0.33.0
243
+ - regex==2023.12.25
244
+ - requests-oauthlib==1.3.1
245
+ - responses==0.18.0
246
+ - rfc3339-validator==0.1.4
247
+ - rfc3986-validator==0.1.1
248
+ - rich==13.7.0
249
+ - rpds-py==0.17.1
250
+ - rsa==4.9
251
+ - safetensors==0.4.1
252
+ - scikit-learn==1.3.2
253
+ - scipy==1.11.4
254
+ - send2trash==1.8.2
255
+ - sentencepiece==0.1.99
256
+ - sentry-sdk==1.39.1
257
+ - setproctitle==1.3.3
258
+ - shtab==1.6.5
259
+ - six==1.16.0
260
+ - smmap==5.0.1
261
+ - sniffio==1.3.0
262
+ - soundfile==0.12.1
263
+ - soupsieve==2.5
264
+ - soxr==0.3.7
265
+ - stack-data==0.6.3
266
+ - tensorboard==2.15.1
267
+ - tensorboard-data-server==0.7.2
268
+ - tensorflow-cpu==2.15.0.post1
269
+ - tensorflow-estimator==2.15.0
270
+ - tensorflow-io-gcs-filesystem==0.35.0
271
+ - termcolor==2.4.0
272
+ - terminado==0.18.0
273
+ - threadpoolctl==3.2.0
274
+ - tinycss2==1.2.1
275
+ - tokenizers==0.15.0
276
+ - torch==2.4.0.dev20240323+cu121
277
+ - torchaudio==2.2.0.dev20240323+cu121
278
+ - torchvision==0.19.0.dev20240323+cu121
279
+ - tornado==6.4
280
+ - tqdm==4.66.1
281
+ - traitlets==5.14.1
282
+ - transformers==4.39.0.dev0
283
+ - triton==2.2.0
284
+ - trl==0.8.6
285
+ - types-python-dateutil==2.8.19.20240106
286
+ - typing-extensions==4.10.0
287
+ - tyro==0.7.0
288
+ - tzdata==2023.3
289
+ - uri-template==1.3.0
290
+ - urllib3==2.1.0
291
+ - wandb==0.16.1
292
+ - wcwidth==0.2.13
293
+ - webcolors==1.13
294
+ - webencodings==0.5.1
295
+ - websocket-client==1.7.0
296
+ - werkzeug==3.0.1
297
+ - wrapt==1.14.1
298
+ - xxhash==3.4.1
299
+ - yarl==1.9.4
300
+ prefix: /fsx/sanchit/miniconda3/envs/venv
wandb/run-20240424_154339-mwp0iutr/files/config.yaml ADDED
@@ -0,0 +1,663 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ python_version: 3.11.5
7
+ cli_version: 0.16.1
8
+ framework: huggingface
9
+ huggingface_version: 4.40.0.dev0
10
+ is_jupyter_run: false
11
+ is_kaggle_kernel: false
12
+ start_time: 1713973419.470656
13
+ t:
14
+ 1:
15
+ - 1
16
+ - 2
17
+ - 3
18
+ - 5
19
+ - 11
20
+ - 49
21
+ - 51
22
+ - 53
23
+ - 55
24
+ - 71
25
+ - 84
26
+ - 98
27
+ 2:
28
+ - 1
29
+ - 2
30
+ - 3
31
+ - 5
32
+ - 11
33
+ - 49
34
+ - 51
35
+ - 53
36
+ - 55
37
+ - 71
38
+ - 84
39
+ - 98
40
+ 3:
41
+ - 7
42
+ - 23
43
+ 4: 3.11.5
44
+ 5: 0.16.1
45
+ 6: 4.40.0.dev0
46
+ 8:
47
+ - 5
48
+ 9:
49
+ 1: transformers_trainer
50
+ 13: linux-x86_64
51
+ m:
52
+ - 1: train/global_step
53
+ 6:
54
+ - 3
55
+ - 1: train/loss
56
+ 5: 1
57
+ 6:
58
+ - 1
59
+ - 1: train/grad_norm
60
+ 5: 1
61
+ 6:
62
+ - 1
63
+ - 1: train/learning_rate
64
+ 5: 1
65
+ 6:
66
+ - 1
67
+ - 1: train/epoch
68
+ 5: 1
69
+ 6:
70
+ - 1
71
+ vocab_size:
72
+ desc: null
73
+ value: 32000
74
+ max_position_embeddings:
75
+ desc: null
76
+ value: 32768
77
+ hidden_size:
78
+ desc: null
79
+ value: 4096
80
+ intermediate_size:
81
+ desc: null
82
+ value: 14336
83
+ num_hidden_layers:
84
+ desc: null
85
+ value: 6
86
+ num_attention_heads:
87
+ desc: null
88
+ value: 32
89
+ sliding_window:
90
+ desc: null
91
+ value: 4096
92
+ num_key_value_heads:
93
+ desc: null
94
+ value: 8
95
+ hidden_act:
96
+ desc: null
97
+ value: silu
98
+ initializer_range:
99
+ desc: null
100
+ value: 0.02
101
+ rms_norm_eps:
102
+ desc: null
103
+ value: 1.0e-05
104
+ use_cache:
105
+ desc: null
106
+ value: false
107
+ rope_theta:
108
+ desc: null
109
+ value: 10000.0
110
+ attention_dropout:
111
+ desc: null
112
+ value: 0.0
113
+ return_dict:
114
+ desc: null
115
+ value: true
116
+ output_hidden_states:
117
+ desc: null
118
+ value: false
119
+ output_attentions:
120
+ desc: null
121
+ value: false
122
+ torchscript:
123
+ desc: null
124
+ value: false
125
+ torch_dtype:
126
+ desc: null
127
+ value: bfloat16
128
+ use_bfloat16:
129
+ desc: null
130
+ value: false
131
+ tf_legacy_loss:
132
+ desc: null
133
+ value: false
134
+ pruned_heads:
135
+ desc: null
136
+ value: {}
137
+ tie_word_embeddings:
138
+ desc: null
139
+ value: false
140
+ chunk_size_feed_forward:
141
+ desc: null
142
+ value: 0
143
+ is_encoder_decoder:
144
+ desc: null
145
+ value: false
146
+ is_decoder:
147
+ desc: null
148
+ value: false
149
+ cross_attention_hidden_size:
150
+ desc: null
151
+ value: null
152
+ add_cross_attention:
153
+ desc: null
154
+ value: false
155
+ tie_encoder_decoder:
156
+ desc: null
157
+ value: false
158
+ max_length:
159
+ desc: null
160
+ value: 20
161
+ min_length:
162
+ desc: null
163
+ value: 0
164
+ do_sample:
165
+ desc: null
166
+ value: false
167
+ early_stopping:
168
+ desc: null
169
+ value: false
170
+ num_beams:
171
+ desc: null
172
+ value: 1
173
+ num_beam_groups:
174
+ desc: null
175
+ value: 1
176
+ diversity_penalty:
177
+ desc: null
178
+ value: 0.0
179
+ temperature:
180
+ desc: null
181
+ value: 1.0
182
+ top_k:
183
+ desc: null
184
+ value: 50
185
+ top_p:
186
+ desc: null
187
+ value: 1.0
188
+ typical_p:
189
+ desc: null
190
+ value: 1.0
191
+ repetition_penalty:
192
+ desc: null
193
+ value: 1.0
194
+ length_penalty:
195
+ desc: null
196
+ value: 1.0
197
+ no_repeat_ngram_size:
198
+ desc: null
199
+ value: 0
200
+ encoder_no_repeat_ngram_size:
201
+ desc: null
202
+ value: 0
203
+ bad_words_ids:
204
+ desc: null
205
+ value: null
206
+ num_return_sequences:
207
+ desc: null
208
+ value: 1
209
+ output_scores:
210
+ desc: null
211
+ value: false
212
+ return_dict_in_generate:
213
+ desc: null
214
+ value: false
215
+ forced_bos_token_id:
216
+ desc: null
217
+ value: null
218
+ forced_eos_token_id:
219
+ desc: null
220
+ value: null
221
+ remove_invalid_values:
222
+ desc: null
223
+ value: false
224
+ exponential_decay_length_penalty:
225
+ desc: null
226
+ value: null
227
+ suppress_tokens:
228
+ desc: null
229
+ value: null
230
+ begin_suppress_tokens:
231
+ desc: null
232
+ value: null
233
+ architectures:
234
+ desc: null
235
+ value:
236
+ - MistralForCausalLM
237
+ finetuning_task:
238
+ desc: null
239
+ value: null
240
+ id2label:
241
+ desc: null
242
+ value:
243
+ '0': LABEL_0
244
+ '1': LABEL_1
245
+ label2id:
246
+ desc: null
247
+ value:
248
+ LABEL_0: 0
249
+ LABEL_1: 1
250
+ tokenizer_class:
251
+ desc: null
252
+ value: null
253
+ prefix:
254
+ desc: null
255
+ value: null
256
+ bos_token_id:
257
+ desc: null
258
+ value: 1
259
+ pad_token_id:
260
+ desc: null
261
+ value: null
262
+ eos_token_id:
263
+ desc: null
264
+ value: 2
265
+ sep_token_id:
266
+ desc: null
267
+ value: null
268
+ decoder_start_token_id:
269
+ desc: null
270
+ value: null
271
+ task_specific_params:
272
+ desc: null
273
+ value: null
274
+ problem_type:
275
+ desc: null
276
+ value: null
277
+ _name_or_path:
278
+ desc: null
279
+ value: sanchit-gandhi/Mistral-7B-v0.1-6-layer
280
+ transformers_version:
281
+ desc: null
282
+ value: 4.40.0.dev0
283
+ model_type:
284
+ desc: null
285
+ value: mistral
286
+ output_dir:
287
+ desc: null
288
+ value: ./
289
+ overwrite_output_dir:
290
+ desc: null
291
+ value: true
292
+ do_train:
293
+ desc: null
294
+ value: false
295
+ do_eval:
296
+ desc: null
297
+ value: true
298
+ do_predict:
299
+ desc: null
300
+ value: false
301
+ evaluation_strategy:
302
+ desc: null
303
+ value: steps
304
+ prediction_loss_only:
305
+ desc: null
306
+ value: false
307
+ per_device_train_batch_size:
308
+ desc: null
309
+ value: 64
310
+ per_device_eval_batch_size:
311
+ desc: null
312
+ value: 32
313
+ per_gpu_train_batch_size:
314
+ desc: null
315
+ value: null
316
+ per_gpu_eval_batch_size:
317
+ desc: null
318
+ value: null
319
+ gradient_accumulation_steps:
320
+ desc: null
321
+ value: 1
322
+ eval_accumulation_steps:
323
+ desc: null
324
+ value: null
325
+ eval_delay:
326
+ desc: null
327
+ value: 0
328
+ learning_rate:
329
+ desc: null
330
+ value: 0.0001
331
+ weight_decay:
332
+ desc: null
333
+ value: 0.0
334
+ adam_beta1:
335
+ desc: null
336
+ value: 0.9
337
+ adam_beta2:
338
+ desc: null
339
+ value: 0.999
340
+ adam_epsilon:
341
+ desc: null
342
+ value: 1.0e-08
343
+ max_grad_norm:
344
+ desc: null
345
+ value: 1.0
346
+ num_train_epochs:
347
+ desc: null
348
+ value: 3.0
349
+ max_steps:
350
+ desc: null
351
+ value: 20000
352
+ lr_scheduler_type:
353
+ desc: null
354
+ value: linear
355
+ lr_scheduler_kwargs:
356
+ desc: null
357
+ value: {}
358
+ warmup_ratio:
359
+ desc: null
360
+ value: 0.0
361
+ warmup_steps:
362
+ desc: null
363
+ value: 500
364
+ log_level:
365
+ desc: null
366
+ value: info
367
+ log_level_replica:
368
+ desc: null
369
+ value: warning
370
+ log_on_each_node:
371
+ desc: null
372
+ value: true
373
+ logging_dir:
374
+ desc: null
375
+ value: ./runs/Apr24_14-23-38_ip-26-0-162-233
376
+ logging_strategy:
377
+ desc: null
378
+ value: steps
379
+ logging_first_step:
380
+ desc: null
381
+ value: true
382
+ logging_steps:
383
+ desc: null
384
+ value: 25
385
+ logging_nan_inf_filter:
386
+ desc: null
387
+ value: true
388
+ save_strategy:
389
+ desc: null
390
+ value: steps
391
+ save_steps:
392
+ desc: null
393
+ value: 500
394
+ save_total_limit:
395
+ desc: null
396
+ value: 5000
397
+ save_safetensors:
398
+ desc: null
399
+ value: true
400
+ save_on_each_node:
401
+ desc: null
402
+ value: false
403
+ save_only_model:
404
+ desc: null
405
+ value: false
406
+ no_cuda:
407
+ desc: null
408
+ value: false
409
+ use_cpu:
410
+ desc: null
411
+ value: false
412
+ use_mps_device:
413
+ desc: null
414
+ value: false
415
+ seed:
416
+ desc: null
417
+ value: 42
418
+ data_seed:
419
+ desc: null
420
+ value: null
421
+ jit_mode_eval:
422
+ desc: null
423
+ value: false
424
+ use_ipex:
425
+ desc: null
426
+ value: false
427
+ bf16:
428
+ desc: null
429
+ value: true
430
+ fp16:
431
+ desc: null
432
+ value: false
433
+ fp16_opt_level:
434
+ desc: null
435
+ value: O1
436
+ half_precision_backend:
437
+ desc: null
438
+ value: auto
439
+ bf16_full_eval:
440
+ desc: null
441
+ value: false
442
+ fp16_full_eval:
443
+ desc: null
444
+ value: false
445
+ tf32:
446
+ desc: null
447
+ value: null
448
+ local_rank:
449
+ desc: null
450
+ value: 0
451
+ ddp_backend:
452
+ desc: null
453
+ value: null
454
+ tpu_num_cores:
455
+ desc: null
456
+ value: null
457
+ tpu_metrics_debug:
458
+ desc: null
459
+ value: false
460
+ debug:
461
+ desc: null
462
+ value: []
463
+ dataloader_drop_last:
464
+ desc: null
465
+ value: false
466
+ eval_steps:
467
+ desc: null
468
+ value: 5000
469
+ dataloader_num_workers:
470
+ desc: null
471
+ value: 0
472
+ dataloader_prefetch_factor:
473
+ desc: null
474
+ value: null
475
+ past_index:
476
+ desc: null
477
+ value: -1
478
+ run_name:
479
+ desc: null
480
+ value: ./
481
+ disable_tqdm:
482
+ desc: null
483
+ value: false
484
+ remove_unused_columns:
485
+ desc: null
486
+ value: true
487
+ label_names:
488
+ desc: null
489
+ value: null
490
+ load_best_model_at_end:
491
+ desc: null
492
+ value: false
493
+ metric_for_best_model:
494
+ desc: null
495
+ value: null
496
+ greater_is_better:
497
+ desc: null
498
+ value: null
499
+ ignore_data_skip:
500
+ desc: null
501
+ value: false
502
+ fsdp:
503
+ desc: null
504
+ value: []
505
+ fsdp_min_num_params:
506
+ desc: null
507
+ value: 0
508
+ fsdp_config:
509
+ desc: null
510
+ value:
511
+ min_num_params: 0
512
+ xla: false
513
+ xla_fsdp_v2: false
514
+ xla_fsdp_grad_ckpt: false
515
+ fsdp_transformer_layer_cls_to_wrap:
516
+ desc: null
517
+ value: null
518
+ accelerator_config:
519
+ desc: null
520
+ value:
521
+ split_batches: false
522
+ dispatch_batches: null
523
+ even_batches: true
524
+ use_seedable_sampler: true
525
+ gradient_accumulation_kwargs: null
526
+ deepspeed:
527
+ desc: null
528
+ value: null
529
+ label_smoothing_factor:
530
+ desc: null
531
+ value: 0.0
532
+ optim:
533
+ desc: null
534
+ value: adamw_torch
535
+ optim_args:
536
+ desc: null
537
+ value: null
538
+ adafactor:
539
+ desc: null
540
+ value: false
541
+ group_by_length:
542
+ desc: null
543
+ value: false
544
+ length_column_name:
545
+ desc: null
546
+ value: length
547
+ report_to:
548
+ desc: null
549
+ value:
550
+ - tensorboard
551
+ - wandb
552
+ ddp_find_unused_parameters:
553
+ desc: null
554
+ value: null
555
+ ddp_bucket_cap_mb:
556
+ desc: null
557
+ value: null
558
+ ddp_broadcast_buffers:
559
+ desc: null
560
+ value: null
561
+ dataloader_pin_memory:
562
+ desc: null
563
+ value: true
564
+ dataloader_persistent_workers:
565
+ desc: null
566
+ value: false
567
+ skip_memory_metrics:
568
+ desc: null
569
+ value: true
570
+ use_legacy_prediction_loop:
571
+ desc: null
572
+ value: false
573
+ push_to_hub:
574
+ desc: null
575
+ value: true
576
+ resume_from_checkpoint:
577
+ desc: null
578
+ value: null
579
+ hub_model_id:
580
+ desc: null
581
+ value: null
582
+ hub_strategy:
583
+ desc: null
584
+ value: every_save
585
+ hub_token:
586
+ desc: null
587
+ value: <HUB_TOKEN>
588
+ hub_private_repo:
589
+ desc: null
590
+ value: false
591
+ hub_always_push:
592
+ desc: null
593
+ value: false
594
+ gradient_checkpointing:
595
+ desc: null
596
+ value: true
597
+ gradient_checkpointing_kwargs:
598
+ desc: null
599
+ value:
600
+ use_reentrant: false
601
+ include_inputs_for_metrics:
602
+ desc: null
603
+ value: false
604
+ fp16_backend:
605
+ desc: null
606
+ value: auto
607
+ push_to_hub_model_id:
608
+ desc: null
609
+ value: null
610
+ push_to_hub_organization:
611
+ desc: null
612
+ value: null
613
+ push_to_hub_token:
614
+ desc: null
615
+ value: <PUSH_TO_HUB_TOKEN>
616
+ mp_parameters:
617
+ desc: null
618
+ value: ''
619
+ auto_find_batch_size:
620
+ desc: null
621
+ value: false
622
+ full_determinism:
623
+ desc: null
624
+ value: false
625
+ torchdynamo:
626
+ desc: null
627
+ value: null
628
+ ray_scope:
629
+ desc: null
630
+ value: last
631
+ ddp_timeout:
632
+ desc: null
633
+ value: 7200
634
+ torch_compile:
635
+ desc: null
636
+ value: false
637
+ torch_compile_backend:
638
+ desc: null
639
+ value: null
640
+ torch_compile_mode:
641
+ desc: null
642
+ value: null
643
+ dispatch_batches:
644
+ desc: null
645
+ value: null
646
+ split_batches:
647
+ desc: null
648
+ value: null
649
+ include_tokens_per_second:
650
+ desc: null
651
+ value: false
652
+ include_num_input_tokens_seen:
653
+ desc: null
654
+ value: false
655
+ neftune_noise_alpha:
656
+ desc: null
657
+ value: null
658
+ optim_target_modules:
659
+ desc: null
660
+ value: null
661
+ max_seq_length:
662
+ desc: null
663
+ value: 2048
wandb/run-20240424_154339-mwp0iutr/files/output.log ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0%| | 0/20000 [00:00<?, ?it/s]/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.)
2
+ warnings.warn(
3
+ 0%| | 1/20000 [00:06<38:19:46, 6.90s/it]
4
+ 0%| | 1/20000 [00:06<38:19:46, 6.90s/it]Traceback (most recent call last):
5
+ File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 217, in <module>
6
+ main()
7
+ File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 172, in main
8
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
9
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
10
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 361, in train
11
+ output = super().train(*args, **kwargs)
12
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
13
+ File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 1849, in train
14
+ return inner_training_loop(
15
+ ^^^^^^^^^^^^^^^^^^^^
16
+ File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 2202, in _inner_training_loop
17
+ tr_loss_step = self.training_step(model, inputs)
18
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
19
+ File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3137, in training_step
20
+ loss = self.compute_loss(model, inputs)
21
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
22
+ File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3160, in compute_loss
23
+ outputs = model(**inputs)
24
+ ^^^^^^^^^^^^^^^
25
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
26
+ return self._call_impl(*args, **kwargs)
27
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
28
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
29
+ return forward_call(*args, **kwargs)
30
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
31
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1608, in forward
32
+ else self._run_ddp_forward(*inputs, **kwargs)
33
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
34
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1426, in _run_ddp_forward
35
+ return self.module(*inputs, **kwargs) # type: ignore[index]
36
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
37
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
38
+ return self._call_impl(*args, **kwargs)
39
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
40
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
41
+ return forward_call(*args, **kwargs)
42
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
43
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 825, in forward
44
+ return model_forward(*args, **kwargs)
45
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
46
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 813, in __call__
47
+ return convert_to_fp32(self.model_forward(*args, **kwargs))
48
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
49
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
50
+ return func(*args, **kwargs)
51
+ ^^^^^^^^^^^^^^^^^^^^^
52
+ File "/fsx/sanchit/transformers/src/transformers/models/mistral/modeling_mistral.py", line 1184, in forward
53
+ loss = loss_fct(shift_logits, shift_labels)
54
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
55
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
56
+ return self._call_impl(*args, **kwargs)
57
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
58
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
59
+ return forward_call(*args, **kwargs)
60
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
61
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1185, in forward
62
+ return F.cross_entropy(input, target, weight=self.weight,
63
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
64
+ File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/functional.py", line 3088, in cross_entropy
65
+ return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
66
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
67
+ torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 15.62 GiB. GPU
68
+ [rank0]: Traceback (most recent call last):
69
+ [rank0]: File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 217, in <module>
70
+ [rank0]: main()
71
+ [rank0]: File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 172, in main
72
+ [rank0]: train_result = trainer.train(resume_from_checkpoint=checkpoint)
73
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
74
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 361, in train
75
+ [rank0]: output = super().train(*args, **kwargs)
76
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
77
+ [rank0]: File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 1849, in train
78
+ [rank0]: return inner_training_loop(
79
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^
80
+ [rank0]: File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 2202, in _inner_training_loop
81
+ [rank0]: tr_loss_step = self.training_step(model, inputs)
82
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
83
+ [rank0]: File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3137, in training_step
84
+ [rank0]: loss = self.compute_loss(model, inputs)
85
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
86
+ [rank0]: File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3160, in compute_loss
87
+ [rank0]: outputs = model(**inputs)
88
+ [rank0]: ^^^^^^^^^^^^^^^
89
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
90
+ [rank0]: return self._call_impl(*args, **kwargs)
91
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
92
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
93
+ [rank0]: return forward_call(*args, **kwargs)
94
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
95
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1608, in forward
96
+ [rank0]: else self._run_ddp_forward(*inputs, **kwargs)
97
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
98
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1426, in _run_ddp_forward
99
+ [rank0]: return self.module(*inputs, **kwargs) # type: ignore[index]
100
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
101
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
102
+ [rank0]: return self._call_impl(*args, **kwargs)
103
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
104
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
105
+ [rank0]: return forward_call(*args, **kwargs)
106
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
107
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 825, in forward
108
+ [rank0]: return model_forward(*args, **kwargs)
109
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
110
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 813, in __call__
111
+ [rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs))
112
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
113
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
114
+ [rank0]: return func(*args, **kwargs)
115
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
116
+ [rank0]: File "/fsx/sanchit/transformers/src/transformers/models/mistral/modeling_mistral.py", line 1184, in forward
117
+ [rank0]: loss = loss_fct(shift_logits, shift_labels)
118
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
119
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
120
+ [rank0]: return self._call_impl(*args, **kwargs)
121
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
122
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
123
+ [rank0]: return forward_call(*args, **kwargs)
124
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
125
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1185, in forward
126
+ [rank0]: return F.cross_entropy(input, target, weight=self.weight,
127
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
128
+ [rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/functional.py", line 3088, in cross_entropy
129
+ [rank0]: return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
130
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
131
+ [rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 15.62 GiB. GPU
wandb/run-20240424_154339-mwp0iutr/files/requirements.txt ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.0.0
2
+ accelerate==0.29.3
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ anyio==4.2.0
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ arrow==1.3.0
11
+ asttokens==2.4.1
12
+ astunparse==1.6.3
13
+ async-lru==2.0.4
14
+ attrs==23.1.0
15
+ audioread==3.0.1
16
+ babel==2.14.0
17
+ beautifulsoup4==4.12.3
18
+ bitsandbytes==0.43.1
19
+ bleach==6.1.0
20
+ brotli==1.0.9
21
+ cachetools==5.3.2
22
+ certifi==2023.11.17
23
+ cffi==1.16.0
24
+ chardet==5.2.0
25
+ charset-normalizer==2.0.4
26
+ click==8.1.7
27
+ comm==0.2.1
28
+ cryptography==41.0.7
29
+ datasets==2.18.1.dev0
30
+ debugpy==1.8.1
31
+ decorator==5.1.1
32
+ deepspeed==0.12.2
33
+ defusedxml==0.7.1
34
+ dill==0.3.7
35
+ docker-pycreds==0.4.0
36
+ docstring-parser==0.15
37
+ einops==0.7.0
38
+ evaluate==0.4.0
39
+ executing==2.0.1
40
+ fastjsonschema==2.19.1
41
+ filelock==3.13.1
42
+ flatbuffers==23.5.26
43
+ fqdn==1.5.1
44
+ frozenlist==1.4.1
45
+ fsspec==2023.10.0
46
+ gast==0.5.4
47
+ gitdb==4.0.11
48
+ gitpython==3.1.40
49
+ gmpy2==2.1.2
50
+ google-auth-oauthlib==1.2.0
51
+ google-auth==2.26.1
52
+ google-pasta==0.2.0
53
+ grpcio==1.60.0
54
+ h11==0.14.0
55
+ h5py==3.10.0
56
+ hf-transfer==0.1.5
57
+ hjson==3.1.0
58
+ httpcore==1.0.2
59
+ httpx==0.26.0
60
+ huggingface-hub==0.22.2
61
+ idna==3.4
62
+ ipdb==0.13.13
63
+ ipykernel==6.29.2
64
+ ipython==8.21.0
65
+ isoduration==20.11.0
66
+ jedi==0.19.1
67
+ jinja2==3.1.2
68
+ jiwer==3.0.3
69
+ joblib==1.3.2
70
+ json5==0.9.14
71
+ jsonpointer==2.4
72
+ jsonschema-specifications==2023.12.1
73
+ jsonschema==4.21.1
74
+ jupyter-client==8.6.0
75
+ jupyter-core==5.7.1
76
+ jupyter-events==0.9.0
77
+ jupyter-lsp==2.2.2
78
+ jupyter-server-terminals==0.5.2
79
+ jupyter-server==2.12.5
80
+ jupyterlab-pygments==0.3.0
81
+ jupyterlab-server==2.25.2
82
+ jupyterlab==4.1.1
83
+ keras==2.15.0
84
+ lazy-loader==0.3
85
+ libclang==16.0.6
86
+ librosa==0.10.1
87
+ llvmlite==0.41.1
88
+ markdown-it-py==3.0.0
89
+ markdown==3.5.1
90
+ markupsafe==2.1.1
91
+ matplotlib-inline==0.1.6
92
+ mdurl==0.1.2
93
+ mistune==3.0.2
94
+ mkl-fft==1.3.8
95
+ mkl-random==1.2.4
96
+ mkl-service==2.4.0
97
+ ml-dtypes==0.2.0
98
+ mpmath==1.3.0
99
+ msgpack==1.0.7
100
+ multidict==6.0.4
101
+ multiprocess==0.70.15
102
+ nbclient==0.9.0
103
+ nbconvert==7.16.0
104
+ nbformat==5.9.2
105
+ nest-asyncio==1.6.0
106
+ networkx==3.1
107
+ ninja==1.11.1.1
108
+ nltk==3.8.1
109
+ notebook-shim==0.2.3
110
+ numba==0.58.1
111
+ numpy==1.26.2
112
+ nvidia-cublas-cu12==12.1.3.1
113
+ nvidia-cuda-cupti-cu12==12.1.105
114
+ nvidia-cuda-nvrtc-cu12==12.1.105
115
+ nvidia-cuda-runtime-cu12==12.1.105
116
+ nvidia-cudnn-cu12==8.9.2.26
117
+ nvidia-cufft-cu12==11.0.2.54
118
+ nvidia-curand-cu12==10.3.2.106
119
+ nvidia-cusolver-cu12==11.4.5.107
120
+ nvidia-cusparse-cu12==12.1.0.106
121
+ nvidia-nccl-cu12==2.20.5
122
+ nvidia-nvjitlink-cu12==12.3.101
123
+ nvidia-nvtx-cu12==12.1.105
124
+ oauthlib==3.2.2
125
+ opt-einsum==3.3.0
126
+ overrides==7.7.0
127
+ packaging==23.2
128
+ pandas==2.1.4
129
+ pandocfilters==1.5.1
130
+ parso==0.8.3
131
+ peft==0.7.1
132
+ pexpect==4.9.0
133
+ pillow==10.2.0
134
+ pip==24.0
135
+ platformdirs==4.1.0
136
+ pooch==1.8.0
137
+ prometheus-client==0.19.0
138
+ prompt-toolkit==3.0.43
139
+ protobuf==3.20.2
140
+ psutil==5.9.7
141
+ ptyprocess==0.7.0
142
+ pure-eval==0.2.2
143
+ py-cpuinfo==9.0.0
144
+ pyarrow-hotfix==0.6
145
+ pyarrow==14.0.2
146
+ pyasn1-modules==0.3.0
147
+ pyasn1==0.5.1
148
+ pycparser==2.21
149
+ pydantic-core==2.16.1
150
+ pydantic==2.6.0
151
+ pygments==2.17.2
152
+ pynvml==11.5.0
153
+ pyopenssl==23.2.0
154
+ pysocks==1.7.1
155
+ python-dateutil==2.8.2
156
+ python-json-logger==2.0.7
157
+ pytorch-triton==3.0.0+989adb9a29
158
+ pytz==2023.3.post1
159
+ pyyaml==6.0.1
160
+ pyzmq==25.1.2
161
+ rapidfuzz==3.6.1
162
+ referencing==0.33.0
163
+ regex==2023.12.25
164
+ requests-oauthlib==1.3.1
165
+ requests==2.31.0
166
+ responses==0.18.0
167
+ rfc3339-validator==0.1.4
168
+ rfc3986-validator==0.1.1
169
+ rich==13.7.0
170
+ rpds-py==0.17.1
171
+ rsa==4.9
172
+ safetensors==0.4.1
173
+ scikit-learn==1.3.2
174
+ scipy==1.11.4
175
+ send2trash==1.8.2
176
+ sentencepiece==0.1.99
177
+ sentry-sdk==1.39.1
178
+ setproctitle==1.3.3
179
+ setuptools==68.2.2
180
+ shtab==1.6.5
181
+ six==1.16.0
182
+ smmap==5.0.1
183
+ sniffio==1.3.0
184
+ soundfile==0.12.1
185
+ soupsieve==2.5
186
+ soxr==0.3.7
187
+ stack-data==0.6.3
188
+ sympy==1.12
189
+ tensorboard-data-server==0.7.2
190
+ tensorboard==2.15.1
191
+ tensorflow-cpu==2.15.0.post1
192
+ tensorflow-estimator==2.15.0
193
+ tensorflow-io-gcs-filesystem==0.35.0
194
+ termcolor==2.4.0
195
+ terminado==0.18.0
196
+ threadpoolctl==3.2.0
197
+ tinycss2==1.2.1
198
+ tokenizers==0.15.0
199
+ torch==2.4.0.dev20240323+cu121
200
+ torchaudio==2.2.0.dev20240323+cu121
201
+ torchvision==0.19.0.dev20240323+cu121
202
+ tornado==6.4
203
+ tqdm==4.66.1
204
+ traitlets==5.14.1
205
+ transformers==4.39.0.dev0
206
+ triton==2.2.0
207
+ trl==0.8.6
208
+ types-python-dateutil==2.8.19.20240106
209
+ typing-extensions==4.10.0
210
+ tyro==0.7.0
211
+ tzdata==2023.3
212
+ uri-template==1.3.0
213
+ urllib3==1.26.18
214
+ wandb==0.16.1
215
+ wcwidth==0.2.13
216
+ webcolors==1.13
217
+ webencodings==0.5.1
218
+ websocket-client==1.7.0
219
+ werkzeug==3.0.1
220
+ wheel==0.41.2
221
+ wrapt==1.14.1
222
+ xxhash==3.4.1
223
+ yarl==1.9.4
wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1048-aws-x86_64-with-glibc2.31",
3
+ "python": "3.11.5",
4
+ "heartbeatAt": "2024-04-24T15:43:39.965097",
5
+ "startedAt": "2024-04-24T15:43:39.449266",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "config_full.yaml"
10
+ ],
11
+ "state": "running",
12
+ "program": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py",
13
+ "codePathLocal": "run_sft.py",
14
+ "codePath": "run_sft.py",
15
+ "git": {
16
+ "remote": "https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat",
17
+ "commit": "cbea69c6b95c970317a1e47c3f614b55b33f8ed9"
18
+ },
19
+ "email": null,
20
+ "root": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat",
21
+ "host": "ip-26-0-162-233",
22
+ "username": "sanchit",
23
+ "executable": "/fsx/sanchit/miniconda3/envs/venv/bin/python",
24
+ "cpu_count": 96,
25
+ "cpu_count_logical": 96,
26
+ "cpu_freq": {
27
+ "current": 2721.9698645833337,
28
+ "min": 0.0,
29
+ "max": 0.0
30
+ },
31
+ "cpu_freq_per_core": [
32
+ {
33
+ "current": 3590.538,
34
+ "min": 0.0,
35
+ "max": 0.0
36
+ },
37
+ {
38
+ "current": 2650.0,
39
+ "min": 0.0,
40
+ "max": 0.0
41
+ },
42
+ {
43
+ "current": 2650.0,
44
+ "min": 0.0,
45
+ "max": 0.0
46
+ },
47
+ {
48
+ "current": 2650.0,
49
+ "min": 0.0,
50
+ "max": 0.0
51
+ },
52
+ {
53
+ "current": 2650.0,
54
+ "min": 0.0,
55
+ "max": 0.0
56
+ },
57
+ {
58
+ "current": 2650.0,
59
+ "min": 0.0,
60
+ "max": 0.0
61
+ },
62
+ {
63
+ "current": 2650.0,
64
+ "min": 0.0,
65
+ "max": 0.0
66
+ },
67
+ {
68
+ "current": 2650.0,
69
+ "min": 0.0,
70
+ "max": 0.0
71
+ },
72
+ {
73
+ "current": 2650.0,
74
+ "min": 0.0,
75
+ "max": 0.0
76
+ },
77
+ {
78
+ "current": 2650.0,
79
+ "min": 0.0,
80
+ "max": 0.0
81
+ },
82
+ {
83
+ "current": 2650.0,
84
+ "min": 0.0,
85
+ "max": 0.0
86
+ },
87
+ {
88
+ "current": 2650.0,
89
+ "min": 0.0,
90
+ "max": 0.0
91
+ },
92
+ {
93
+ "current": 2650.0,
94
+ "min": 0.0,
95
+ "max": 0.0
96
+ },
97
+ {
98
+ "current": 2650.0,
99
+ "min": 0.0,
100
+ "max": 0.0
101
+ },
102
+ {
103
+ "current": 3595.996,
104
+ "min": 0.0,
105
+ "max": 0.0
106
+ },
107
+ {
108
+ "current": 2650.0,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2650.0,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2650.0,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2650.0,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2650.0,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2650.0,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2650.0,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2650.0,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 3597.59,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2650.0,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 3399.936,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2650.0,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2650.0,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2650.0,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 3598.273,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2650.0,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2650.0,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2650.0,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2650.0,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ },
202
+ {
203
+ "current": 2650.0,
204
+ "min": 0.0,
205
+ "max": 0.0
206
+ },
207
+ {
208
+ "current": 2650.0,
209
+ "min": 0.0,
210
+ "max": 0.0
211
+ },
212
+ {
213
+ "current": 2650.0,
214
+ "min": 0.0,
215
+ "max": 0.0
216
+ },
217
+ {
218
+ "current": 3597.284,
219
+ "min": 0.0,
220
+ "max": 0.0
221
+ },
222
+ {
223
+ "current": 3036.337,
224
+ "min": 0.0,
225
+ "max": 0.0
226
+ },
227
+ {
228
+ "current": 2650.0,
229
+ "min": 0.0,
230
+ "max": 0.0
231
+ },
232
+ {
233
+ "current": 3597.887,
234
+ "min": 0.0,
235
+ "max": 0.0
236
+ },
237
+ {
238
+ "current": 2650.0,
239
+ "min": 0.0,
240
+ "max": 0.0
241
+ },
242
+ {
243
+ "current": 3598.442,
244
+ "min": 0.0,
245
+ "max": 0.0
246
+ },
247
+ {
248
+ "current": 2650.0,
249
+ "min": 0.0,
250
+ "max": 0.0
251
+ },
252
+ {
253
+ "current": 2650.0,
254
+ "min": 0.0,
255
+ "max": 0.0
256
+ },
257
+ {
258
+ "current": 2650.0,
259
+ "min": 0.0,
260
+ "max": 0.0
261
+ },
262
+ {
263
+ "current": 2650.0,
264
+ "min": 0.0,
265
+ "max": 0.0
266
+ },
267
+ {
268
+ "current": 2650.0,
269
+ "min": 0.0,
270
+ "max": 0.0
271
+ },
272
+ {
273
+ "current": 2650.0,
274
+ "min": 0.0,
275
+ "max": 0.0
276
+ },
277
+ {
278
+ "current": 2650.0,
279
+ "min": 0.0,
280
+ "max": 0.0
281
+ },
282
+ {
283
+ "current": 2650.0,
284
+ "min": 0.0,
285
+ "max": 0.0
286
+ },
287
+ {
288
+ "current": 2650.0,
289
+ "min": 0.0,
290
+ "max": 0.0
291
+ },
292
+ {
293
+ "current": 2650.0,
294
+ "min": 0.0,
295
+ "max": 0.0
296
+ },
297
+ {
298
+ "current": 2650.0,
299
+ "min": 0.0,
300
+ "max": 0.0
301
+ },
302
+ {
303
+ "current": 2650.0,
304
+ "min": 0.0,
305
+ "max": 0.0
306
+ },
307
+ {
308
+ "current": 2650.0,
309
+ "min": 0.0,
310
+ "max": 0.0
311
+ },
312
+ {
313
+ "current": 2650.0,
314
+ "min": 0.0,
315
+ "max": 0.0
316
+ },
317
+ {
318
+ "current": 2650.0,
319
+ "min": 0.0,
320
+ "max": 0.0
321
+ },
322
+ {
323
+ "current": 2650.0,
324
+ "min": 0.0,
325
+ "max": 0.0
326
+ },
327
+ {
328
+ "current": 2650.0,
329
+ "min": 0.0,
330
+ "max": 0.0
331
+ },
332
+ {
333
+ "current": 2650.0,
334
+ "min": 0.0,
335
+ "max": 0.0
336
+ },
337
+ {
338
+ "current": 2650.0,
339
+ "min": 0.0,
340
+ "max": 0.0
341
+ },
342
+ {
343
+ "current": 2650.0,
344
+ "min": 0.0,
345
+ "max": 0.0
346
+ },
347
+ {
348
+ "current": 2650.0,
349
+ "min": 0.0,
350
+ "max": 0.0
351
+ },
352
+ {
353
+ "current": 2650.0,
354
+ "min": 0.0,
355
+ "max": 0.0
356
+ },
357
+ {
358
+ "current": 2650.0,
359
+ "min": 0.0,
360
+ "max": 0.0
361
+ },
362
+ {
363
+ "current": 2650.0,
364
+ "min": 0.0,
365
+ "max": 0.0
366
+ },
367
+ {
368
+ "current": 2650.0,
369
+ "min": 0.0,
370
+ "max": 0.0
371
+ },
372
+ {
373
+ "current": 2650.0,
374
+ "min": 0.0,
375
+ "max": 0.0
376
+ },
377
+ {
378
+ "current": 2650.0,
379
+ "min": 0.0,
380
+ "max": 0.0
381
+ },
382
+ {
383
+ "current": 2650.0,
384
+ "min": 0.0,
385
+ "max": 0.0
386
+ },
387
+ {
388
+ "current": 2650.0,
389
+ "min": 0.0,
390
+ "max": 0.0
391
+ },
392
+ {
393
+ "current": 2650.0,
394
+ "min": 0.0,
395
+ "max": 0.0
396
+ },
397
+ {
398
+ "current": 2650.0,
399
+ "min": 0.0,
400
+ "max": 0.0
401
+ },
402
+ {
403
+ "current": 2650.0,
404
+ "min": 0.0,
405
+ "max": 0.0
406
+ },
407
+ {
408
+ "current": 2650.0,
409
+ "min": 0.0,
410
+ "max": 0.0
411
+ },
412
+ {
413
+ "current": 2650.0,
414
+ "min": 0.0,
415
+ "max": 0.0
416
+ },
417
+ {
418
+ "current": 2650.0,
419
+ "min": 0.0,
420
+ "max": 0.0
421
+ },
422
+ {
423
+ "current": 2650.0,
424
+ "min": 0.0,
425
+ "max": 0.0
426
+ },
427
+ {
428
+ "current": 2650.0,
429
+ "min": 0.0,
430
+ "max": 0.0
431
+ },
432
+ {
433
+ "current": 2650.0,
434
+ "min": 0.0,
435
+ "max": 0.0
436
+ },
437
+ {
438
+ "current": 2650.0,
439
+ "min": 0.0,
440
+ "max": 0.0
441
+ },
442
+ {
443
+ "current": 2650.0,
444
+ "min": 0.0,
445
+ "max": 0.0
446
+ },
447
+ {
448
+ "current": 2650.0,
449
+ "min": 0.0,
450
+ "max": 0.0
451
+ },
452
+ {
453
+ "current": 2650.0,
454
+ "min": 0.0,
455
+ "max": 0.0
456
+ },
457
+ {
458
+ "current": 2650.0,
459
+ "min": 0.0,
460
+ "max": 0.0
461
+ },
462
+ {
463
+ "current": 2650.0,
464
+ "min": 0.0,
465
+ "max": 0.0
466
+ },
467
+ {
468
+ "current": 2650.0,
469
+ "min": 0.0,
470
+ "max": 0.0
471
+ },
472
+ {
473
+ "current": 2650.0,
474
+ "min": 0.0,
475
+ "max": 0.0
476
+ },
477
+ {
478
+ "current": 2650.0,
479
+ "min": 0.0,
480
+ "max": 0.0
481
+ },
482
+ {
483
+ "current": 2650.0,
484
+ "min": 0.0,
485
+ "max": 0.0
486
+ },
487
+ {
488
+ "current": 2650.0,
489
+ "min": 0.0,
490
+ "max": 0.0
491
+ },
492
+ {
493
+ "current": 2650.0,
494
+ "min": 0.0,
495
+ "max": 0.0
496
+ },
497
+ {
498
+ "current": 2650.0,
499
+ "min": 0.0,
500
+ "max": 0.0
501
+ },
502
+ {
503
+ "current": 2650.0,
504
+ "min": 0.0,
505
+ "max": 0.0
506
+ },
507
+ {
508
+ "current": 2650.0,
509
+ "min": 0.0,
510
+ "max": 0.0
511
+ }
512
+ ],
513
+ "disk": {
514
+ "/": {
515
+ "total": 290.7472343444824,
516
+ "used": 59.263893127441406
517
+ }
518
+ },
519
+ "gpu": "NVIDIA H100 80GB HBM3",
520
+ "gpu_count": 8,
521
+ "gpu_devices": [
522
+ {
523
+ "name": "NVIDIA H100 80GB HBM3",
524
+ "memory_total": 85520809984
525
+ },
526
+ {
527
+ "name": "NVIDIA H100 80GB HBM3",
528
+ "memory_total": 85520809984
529
+ },
530
+ {
531
+ "name": "NVIDIA H100 80GB HBM3",
532
+ "memory_total": 85520809984
533
+ },
534
+ {
535
+ "name": "NVIDIA H100 80GB HBM3",
536
+ "memory_total": 85520809984
537
+ },
538
+ {
539
+ "name": "NVIDIA H100 80GB HBM3",
540
+ "memory_total": 85520809984
541
+ },
542
+ {
543
+ "name": "NVIDIA H100 80GB HBM3",
544
+ "memory_total": 85520809984
545
+ },
546
+ {
547
+ "name": "NVIDIA H100 80GB HBM3",
548
+ "memory_total": 85520809984
549
+ },
550
+ {
551
+ "name": "NVIDIA H100 80GB HBM3",
552
+ "memory_total": 85520809984
553
+ }
554
+ ],
555
+ "memory": {
556
+ "total": 1999.9855270385742
557
+ }
558
+ }
wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/loss": 14.0246, "train/grad_norm": 1440.0, "train/learning_rate": 2.0000000000000002e-07, "train/epoch": 0.0, "train/global_step": 1, "_timestamp": 1713973432.7827635, "_runtime": 13.312107563018799, "_step": 0, "_wandb": {"runtime": 14}}
wandb/run-20240424_154339-mwp0iutr/logs/debug-internal.log ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-04-24 15:43:39,468 INFO StreamThr :1848599 [internal.py:wandb_internal():86] W&B internal server running at pid: 1848599, started at: 2024-04-24 15:43:39.467078
2
+ 2024-04-24 15:43:39,469 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status
3
+ 2024-04-24 15:43:39,473 INFO WriterThread:1848599 [datastore.py:open_for_write():85] open: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb
4
+ 2024-04-24 15:43:39,476 DEBUG SenderThread:1848599 [sender.py:send():382] send: header
5
+ 2024-04-24 15:43:39,521 DEBUG SenderThread:1848599 [sender.py:send():382] send: run
6
+ 2024-04-24 15:43:39,793 INFO SenderThread:1848599 [dir_watcher.py:__init__():211] watching files in: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files
7
+ 2024-04-24 15:43:39,793 INFO SenderThread:1848599 [sender.py:_start_run_threads():1136] run started: mwp0iutr with start time 1713973419.470656
8
+ 2024-04-24 15:43:39,798 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-04-24 15:43:39,799 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: check_version
10
+ 2024-04-24 15:43:39,851 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-04-24 15:43:39,908 DEBUG HandlerThread:1848599 [system_info.py:__init__():32] System info init
12
+ 2024-04-24 15:43:39,908 DEBUG HandlerThread:1848599 [system_info.py:__init__():47] System info init done
13
+ 2024-04-24 15:43:39,908 INFO HandlerThread:1848599 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-04-24 15:43:39,908 INFO SystemMonitor:1848599 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-04-24 15:43:39,909 INFO HandlerThread:1848599 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-04-24 15:43:39,909 INFO SystemMonitor:1848599 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-04-24 15:43:39,909 INFO SystemMonitor:1848599 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-04-24 15:43:39,910 INFO SystemMonitor:1848599 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-04-24 15:43:39,911 INFO SystemMonitor:1848599 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-04-24 15:43:39,911 INFO SystemMonitor:1848599 [interfaces.py:start():190] Started network monitoring
21
+ 2024-04-24 15:43:39,965 DEBUG HandlerThread:1848599 [system_info.py:probe():196] Probing system
22
+ 2024-04-24 15:43:39,967 DEBUG HandlerThread:1848599 [system_info.py:_probe_git():181] Probing git
23
+ 2024-04-24 15:43:39,987 DEBUG HandlerThread:1848599 [system_info.py:_probe_git():189] Probing git done
24
+ 2024-04-24 15:43:39,987 DEBUG HandlerThread:1848599 [system_info.py:probe():244] Probing system done
25
+ 2024-04-24 15:43:39,987 DEBUG HandlerThread:1848599 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-1048-aws-x86_64-with-glibc2.31', 'python': '3.11.5', 'heartbeatAt': '2024-04-24T15:43:39.965097', 'startedAt': '2024-04-24T15:43:39.449266', 'docker': None, 'cuda': None, 'args': ('config_full.yaml',), 'state': 'running', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'codePathLocal': 'run_sft.py', 'codePath': 'run_sft.py', 'git': {'remote': 'https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat', 'commit': 'cbea69c6b95c970317a1e47c3f614b55b33f8ed9'}, 'email': None, 'root': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat', 'host': 'ip-26-0-162-233', 'username': 'sanchit', 'executable': '/fsx/sanchit/miniconda3/envs/venv/bin/python', 'cpu_count': 96, 'cpu_count_logical': 96, 'cpu_freq': {'current': 2721.9698645833337, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 3590.538, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3595.996, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.59, 'min': 0.0, 
'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3399.936, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.273, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.284, 'min': 0.0, 'max': 0.0}, {'current': 3036.337, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.887, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.442, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 
0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 290.7472343444824, 'used': 59.263893127441406}}, 'gpu': 'NVIDIA H100 80GB HBM3', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}], 'memory': {'total': 1999.9855270385742}}
26
+ 2024-04-24 15:43:39,988 INFO HandlerThread:1848599 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-04-24 15:43:39,988 INFO HandlerThread:1848599 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-04-24 15:43:39,988 DEBUG HandlerThread:1848599 [system_info.py:_save_pip():52] Saving list of pip packages installed into the current environment
29
+ 2024-04-24 15:43:39,989 DEBUG HandlerThread:1848599 [system_info.py:_save_pip():68] Saving pip packages done
30
+ 2024-04-24 15:43:39,990 DEBUG HandlerThread:1848599 [system_info.py:_save_conda():75] Saving list of conda packages installed into the current environment
31
+ 2024-04-24 15:43:40,795 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml
32
+ 2024-04-24 15:43:40,796 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/requirements.txt
33
+ 2024-04-24 15:43:45,799 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml
34
+ 2024-04-24 15:43:45,805 DEBUG HandlerThread:1848599 [system_info.py:_save_conda():87] Saving conda packages done
35
+ 2024-04-24 15:43:45,807 INFO HandlerThread:1848599 [system_monitor.py:probe():229] Finished publishing system info
36
+ 2024-04-24 15:43:45,857 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
37
+ 2024-04-24 15:43:45,857 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: keepalive
38
+ 2024-04-24 15:43:45,858 DEBUG SenderThread:1848599 [sender.py:send():382] send: files
39
+ 2024-04-24 15:43:45,858 INFO SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-metadata.json with policy now
40
+ 2024-04-24 15:43:45,864 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: stop_status
41
+ 2024-04-24 15:43:45,865 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: stop_status
42
+ 2024-04-24 15:43:45,867 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: internal_messages
43
+ 2024-04-24 15:43:45,993 DEBUG SenderThread:1848599 [sender.py:send():382] send: telemetry
44
+ 2024-04-24 15:43:45,993 DEBUG SenderThread:1848599 [sender.py:send():382] send: config
45
+ 2024-04-24 15:43:45,993 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
46
+ 2024-04-24 15:43:45,994 DEBUG SenderThread:1848599 [sender.py:send():382] send: telemetry
47
+ 2024-04-24 15:43:45,994 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
48
+ 2024-04-24 15:43:45,994 WARNING SenderThread:1848599 [sender.py:send_metric():1343] Seen metric with glob (shouldn't happen)
49
+ 2024-04-24 15:43:45,994 DEBUG SenderThread:1848599 [sender.py:send():382] send: telemetry
50
+ 2024-04-24 15:43:46,179 INFO wandb-upload_0:1848599 [upload_job.py:push():131] Uploaded file /tmp/tmphsb5r9cdwandb/sgr8lmob-wandb-metadata.json
51
+ 2024-04-24 15:43:46,800 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json
52
+ 2024-04-24 15:43:46,801 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
53
+ 2024-04-24 15:43:48,803 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
54
+ 2024-04-24 15:43:50,251 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-04-24 15:43:52,783 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: partial_history
56
+ 2024-04-24 15:43:52,785 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
57
+ 2024-04-24 15:43:52,785 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
58
+ 2024-04-24 15:43:52,786 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
59
+ 2024-04-24 15:43:52,786 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
60
+ 2024-04-24 15:43:52,786 DEBUG SenderThread:1848599 [sender.py:send():382] send: history
61
+ 2024-04-24 15:43:52,786 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: summary_record
62
+ 2024-04-24 15:43:52,788 INFO SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
63
+ 2024-04-24 15:43:52,807 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
64
+ 2024-04-24 15:43:54,212 DEBUG SenderThread:1848599 [sender.py:send():382] send: exit
65
+ 2024-04-24 15:43:54,212 INFO SenderThread:1848599 [sender.py:send_exit():589] handling exit code: 1
66
+ 2024-04-24 15:43:54,212 INFO SenderThread:1848599 [sender.py:send_exit():591] handling runtime: 14
67
+ 2024-04-24 15:43:54,213 INFO SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
68
+ 2024-04-24 15:43:54,213 INFO SenderThread:1848599 [sender.py:send_exit():597] send defer
69
+ 2024-04-24 15:43:54,213 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
70
+ 2024-04-24 15:43:54,213 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 0
71
+ 2024-04-24 15:43:54,214 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
72
+ 2024-04-24 15:43:54,214 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 0
73
+ 2024-04-24 15:43:54,214 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 1
74
+ 2024-04-24 15:43:54,214 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
75
+ 2024-04-24 15:43:54,214 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 1
76
+ 2024-04-24 15:43:54,214 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
77
+ 2024-04-24 15:43:54,214 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 1
78
+ 2024-04-24 15:43:54,214 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 2
79
+ 2024-04-24 15:43:54,214 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
80
+ 2024-04-24 15:43:54,214 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 2
81
+ 2024-04-24 15:43:54,214 INFO HandlerThread:1848599 [system_monitor.py:finish():203] Stopping system monitor
82
+ 2024-04-24 15:43:54,214 DEBUG SystemMonitor:1848599 [system_monitor.py:_start():172] Starting system metrics aggregation loop
83
+ 2024-04-24 15:43:54,215 DEBUG SystemMonitor:1848599 [system_monitor.py:_start():179] Finished system metrics aggregation loop
84
+ 2024-04-24 15:43:54,215 DEBUG SystemMonitor:1848599 [system_monitor.py:_start():183] Publishing last batch of metrics
85
+ 2024-04-24 15:43:54,215 INFO HandlerThread:1848599 [interfaces.py:finish():202] Joined cpu monitor
86
+ 2024-04-24 15:43:54,217 INFO HandlerThread:1848599 [interfaces.py:finish():202] Joined disk monitor
87
+ 2024-04-24 15:43:54,810 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
88
+ 2024-04-24 15:43:54,810 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
89
+ 2024-04-24 15:43:56,812 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
90
+ 2024-04-24 15:43:57,141 INFO HandlerThread:1848599 [interfaces.py:finish():202] Joined gpu monitor
91
+ 2024-04-24 15:43:57,142 INFO HandlerThread:1848599 [interfaces.py:finish():202] Joined memory monitor
92
+ 2024-04-24 15:43:57,142 INFO HandlerThread:1848599 [interfaces.py:finish():202] Joined network monitor
93
+ 2024-04-24 15:43:57,142 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: poll_exit
94
+ 2024-04-24 15:43:57,143 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-04-24 15:43:57,143 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
96
+ 2024-04-24 15:43:57,143 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 2
97
+ 2024-04-24 15:43:57,143 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 3
98
+ 2024-04-24 15:43:57,144 DEBUG SenderThread:1848599 [sender.py:send():382] send: stats
99
+ 2024-04-24 15:43:57,144 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
100
+ 2024-04-24 15:43:57,144 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: poll_exit
101
+ 2024-04-24 15:43:57,145 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 3
102
+ 2024-04-24 15:43:57,146 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
103
+ 2024-04-24 15:43:57,146 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 3
104
+ 2024-04-24 15:43:57,146 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 4
105
+ 2024-04-24 15:43:57,146 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
106
+ 2024-04-24 15:43:57,146 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 4
107
+ 2024-04-24 15:43:57,147 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
108
+ 2024-04-24 15:43:57,147 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 4
109
+ 2024-04-24 15:43:57,147 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 5
110
+ 2024-04-24 15:43:57,147 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
111
+ 2024-04-24 15:43:57,147 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 5
112
+ 2024-04-24 15:43:57,147 DEBUG SenderThread:1848599 [sender.py:send():382] send: summary
113
+ 2024-04-24 15:43:57,149 INFO SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
114
+ 2024-04-24 15:43:57,149 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
115
+ 2024-04-24 15:43:57,149 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 5
116
+ 2024-04-24 15:43:57,149 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 6
117
+ 2024-04-24 15:43:57,149 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
118
+ 2024-04-24 15:43:57,149 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 6
119
+ 2024-04-24 15:43:57,149 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
120
+ 2024-04-24 15:43:57,149 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 6
121
+ 2024-04-24 15:43:57,152 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
122
+ 2024-04-24 15:43:57,275 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 7
123
+ 2024-04-24 15:43:57,275 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
124
+ 2024-04-24 15:43:57,275 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 7
125
+ 2024-04-24 15:43:57,275 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
126
+ 2024-04-24 15:43:57,275 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 7
127
+ 2024-04-24 15:43:57,814 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/config.yaml
128
+ 2024-04-24 15:43:57,814 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
129
+ 2024-04-24 15:43:58,791 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 8
130
+ 2024-04-24 15:43:58,792 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
131
+ 2024-04-24 15:43:58,792 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 8
132
+ 2024-04-24 15:43:58,792 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
133
+ 2024-04-24 15:43:58,792 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 8
134
+ 2024-04-24 15:43:58,792 INFO SenderThread:1848599 [job_builder.py:build():298] Attempting to build job artifact
135
+ 2024-04-24 15:43:58,794 INFO SenderThread:1848599 [job_builder.py:_get_source_type():428] is repo sourced job
136
+ 2024-04-24 15:43:58,815 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
137
+ 2024-04-24 15:43:58,832 INFO SenderThread:1848599 [job_builder.py:build():404] adding wandb-job metadata file
138
+ 2024-04-24 15:43:58,858 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 9
139
+ 2024-04-24 15:43:58,859 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
140
+ 2024-04-24 15:43:58,859 DEBUG SenderThread:1848599 [sender.py:send():382] send: artifact
141
+ 2024-04-24 15:43:58,859 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 9
142
+ 2024-04-24 15:43:59,524 INFO wandb-upload_0:1848599 [upload_job.py:push():89] Uploaded file /admin/home/sanchit/.local/share/wandb/artifacts/staging/tmp1vajxumh
143
+ 2024-04-24 15:43:59,530 INFO wandb-upload_1:1848599 [upload_job.py:push():89] Uploaded file /admin/home/sanchit/.local/share/wandb/artifacts/staging/tmp824ipvc5
144
+ 2024-04-24 15:44:00,093 INFO SenderThread:1848599 [sender.py:send_artifact():1470] sent artifact job-https___huggingface.co_sanchit-gandhi_distil-zephyr-1.5b-ssft-ultrachat_run_sft.py - {'id': 'QXJ0aWZhY3Q6ODA4NTQyNDIx', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjE2NjI0NzU4Nw==', 'latestArtifact': None}}
145
+ 2024-04-24 15:44:00,093 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
146
+ 2024-04-24 15:44:00,093 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 9
147
+ 2024-04-24 15:44:00,093 INFO SenderThread:1848599 [dir_watcher.py:finish():358] shutting down directory watcher
148
+ 2024-04-24 15:44:00,213 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: keepalive
149
+ 2024-04-24 15:44:00,816 INFO SenderThread:1848599 [dir_watcher.py:finish():388] scan: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files
150
+ 2024-04-24 15:44:00,817 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml conda-environment.yaml
151
+ 2024-04-24 15:44:00,817 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json wandb-summary.json
152
+ 2024-04-24 15:44:00,817 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log output.log
153
+ 2024-04-24 15:44:00,821 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/config.yaml config.yaml
154
+ 2024-04-24 15:44:00,824 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/requirements.txt requirements.txt
155
+ 2024-04-24 15:44:00,826 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json wandb-metadata.json
156
+ 2024-04-24 15:44:00,826 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 10
157
+ 2024-04-24 15:44:00,828 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
158
+ 2024-04-24 15:44:00,828 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 10
159
+ 2024-04-24 15:44:00,828 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
160
+ 2024-04-24 15:44:00,828 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 10
161
+ 2024-04-24 15:44:00,828 INFO SenderThread:1848599 [file_pusher.py:finish():175] shutting down file pusher
162
+ 2024-04-24 15:44:01,006 INFO wandb-upload_0:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml
163
+ 2024-04-24 15:44:01,059 INFO wandb-upload_1:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
164
+ 2024-04-24 15:44:01,161 INFO wandb-upload_2:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
165
+ 2024-04-24 15:44:01,169 INFO wandb-upload_3:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/config.yaml
166
+ 2024-04-24 15:44:01,184 INFO wandb-upload_4:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/requirements.txt
167
+ 2024-04-24 15:44:01,384 INFO Thread-11 (_thread_body):1848599 [sender.py:transition_state():617] send defer: 11
168
+ 2024-04-24 15:44:01,385 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
169
+ 2024-04-24 15:44:01,385 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 11
170
+ 2024-04-24 15:44:01,385 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
171
+ 2024-04-24 15:44:01,385 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 11
172
+ 2024-04-24 15:44:01,385 INFO SenderThread:1848599 [file_pusher.py:join():181] waiting for file pusher
173
+ 2024-04-24 15:44:01,385 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 12
174
+ 2024-04-24 15:44:01,385 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
175
+ 2024-04-24 15:44:01,385 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 12
176
+ 2024-04-24 15:44:01,385 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
177
+ 2024-04-24 15:44:01,385 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 12
178
+ 2024-04-24 15:44:01,386 INFO SenderThread:1848599 [file_stream.py:finish():595] file stream finish called
179
+ 2024-04-24 15:44:01,445 INFO SenderThread:1848599 [file_stream.py:finish():599] file stream finish is done
180
+ 2024-04-24 15:44:01,445 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 13
181
+ 2024-04-24 15:44:01,445 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
182
+ 2024-04-24 15:44:01,445 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 13
183
+ 2024-04-24 15:44:01,445 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
184
+ 2024-04-24 15:44:01,445 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 13
185
+ 2024-04-24 15:44:01,445 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 14
186
+ 2024-04-24 15:44:01,446 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
187
+ 2024-04-24 15:44:01,446 DEBUG SenderThread:1848599 [sender.py:send():382] send: final
188
+ 2024-04-24 15:44:01,446 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 14
189
+ 2024-04-24 15:44:01,446 DEBUG SenderThread:1848599 [sender.py:send():382] send: footer
190
+ 2024-04-24 15:44:01,446 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
191
+ 2024-04-24 15:44:01,446 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 14
192
+ 2024-04-24 15:44:01,447 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: poll_exit
193
+ 2024-04-24 15:44:01,447 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: poll_exit
194
+ 2024-04-24 15:44:01,447 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: server_info
195
+ 2024-04-24 15:44:01,447 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: get_summary
196
+ 2024-04-24 15:44:01,448 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: server_info
197
+ 2024-04-24 15:44:01,449 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: sampled_history
198
+ 2024-04-24 15:44:01,449 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: internal_messages
199
+ 2024-04-24 15:44:01,450 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: job_info
200
+ 2024-04-24 15:44:01,507 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: job_info
201
+ 2024-04-24 15:44:01,508 INFO MainThread:1848599 [wandb_run.py:_footer_history_summary_info():3837] rendering history
202
+ 2024-04-24 15:44:01,508 INFO MainThread:1848599 [wandb_run.py:_footer_history_summary_info():3869] rendering summary
203
+ 2024-04-24 15:44:01,508 INFO MainThread:1848599 [wandb_run.py:_footer_sync_info():3796] logging synced files
204
+ 2024-04-24 15:44:01,508 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: shutdown
205
+ 2024-04-24 15:44:01,508 INFO HandlerThread:1848599 [handler.py:finish():866] shutting down handler
206
+ 2024-04-24 15:44:02,450 INFO WriterThread:1848599 [datastore.py:close():294] close: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb
207
+ 2024-04-24 15:44:02,508 INFO SenderThread:1848599 [sender.py:finish():1548] shutting down sender
208
+ 2024-04-24 15:44:02,508 INFO SenderThread:1848599 [file_pusher.py:finish():175] shutting down file pusher
209
+ 2024-04-24 15:44:02,508 INFO SenderThread:1848599 [file_pusher.py:join():181] waiting for file pusher
wandb/run-20240424_154339-mwp0iutr/logs/debug.log ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-04-24 15:43:39,459 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
2
+ 2024-04-24 15:43:39,459 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Configure stats pid to 1840687
3
+ 2024-04-24 15:43:39,459 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
4
+ 2024-04-24 15:43:39,459 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/settings
5
+ 2024-04-24 15:43:39,459 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py'}
8
+ 2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/logs/debug.log
9
+ 2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/logs/debug-internal.log
10
+ 2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:init():564] calling init triggers
11
+ 2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
12
+ config: {}
13
+ 2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:init():614] starting backend
14
+ 2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:init():618] setting up manager
15
+ 2024-04-24 15:43:39,465 INFO MainThread:1840687 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-04-24 15:43:39,470 INFO MainThread:1840687 [wandb_init.py:init():624] backend started and connected
17
+ 2024-04-24 15:43:39,472 INFO MainThread:1840687 [wandb_init.py:init():716] updated telemetry
18
+ 2024-04-24 15:43:39,520 INFO MainThread:1840687 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
19
+ 2024-04-24 15:43:39,798 INFO MainThread:1840687 [wandb_run.py:_on_init():2254] communicating current version
20
+ 2024-04-24 15:43:39,844 INFO MainThread:1840687 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-04-24 15:43:39,844 INFO MainThread:1840687 [wandb_init.py:init():800] starting run threads in backend
23
+ 2024-04-24 15:43:45,864 INFO MainThread:1840687 [wandb_run.py:_console_start():2233] atexit reg
24
+ 2024-04-24 15:43:45,864 INFO MainThread:1840687 [wandb_run.py:_redirect():2088] redirect: wrap_raw
25
+ 2024-04-24 15:43:45,864 INFO MainThread:1840687 [wandb_run.py:_redirect():2153] Wrapping output streams.
26
+ 2024-04-24 15:43:45,864 INFO MainThread:1840687 [wandb_run.py:_redirect():2178] Redirects installed.
27
+ 2024-04-24 15:43:45,866 INFO MainThread:1840687 [wandb_init.py:init():841] run started, returning control to user process
28
+ 2024-04-24 15:43:45,867 INFO MainThread:1840687 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.40.0.dev0', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 64, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 20000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr24_14-23-38_ip-26-0-162-233', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 5000, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 5000, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': 
False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
29
+ 2024-04-24 15:44:02,589 WARNING MsgRouterThr:1840687 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb ADDED
Binary file (26.4 kB). View file
 
wandb/run-20240424_164324-xfbnm7qo/files/conda-environment.yaml ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: venv
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - defaults
6
+ dependencies:
7
+ - _libgcc_mutex=0.1=main
8
+ - _openmp_mutex=5.1=1_gnu
9
+ - blas=1.0=mkl
10
+ - brotli-python=1.0.9=py311h6a678d5_7
11
+ - bzip2=1.0.8=h7b6447c_0
12
+ - ca-certificates=2023.12.12=h06a4308_0
13
+ - certifi=2023.11.17=py311h06a4308_0
14
+ - cffi=1.16.0=py311h5eee18b_0
15
+ - cryptography=41.0.7=py311hdda0065_0
16
+ - cuda-cudart=12.1.105=0
17
+ - cuda-cupti=12.1.105=0
18
+ - cuda-libraries=12.1.0=0
19
+ - cuda-nvrtc=12.1.105=0
20
+ - cuda-nvtx=12.1.105=0
21
+ - cuda-opencl=12.3.101=0
22
+ - cuda-runtime=12.1.0=0
23
+ - ffmpeg=4.3=hf484d3e_0
24
+ - filelock=3.13.1=py311h06a4308_0
25
+ - freetype=2.12.1=h4a9f257_0
26
+ - giflib=5.2.1=h5eee18b_3
27
+ - gmp=6.2.1=h295c915_3
28
+ - gmpy2=2.1.2=py311hc9b5ff0_0
29
+ - gnutls=3.6.15=he1e5248_0
30
+ - intel-openmp=2023.1.0=hdb19cb5_46306
31
+ - jinja2=3.1.2=py311h06a4308_0
32
+ - jpeg=9e=h5eee18b_1
33
+ - lame=3.100=h7b6447c_0
34
+ - lcms2=2.12=h3be6417_0
35
+ - ld_impl_linux-64=2.38=h1181459_1
36
+ - lerc=3.0=h295c915_0
37
+ - libcublas=12.1.0.26=0
38
+ - libcufft=11.0.2.4=0
39
+ - libcufile=1.8.1.2=0
40
+ - libcurand=10.3.4.101=0
41
+ - libcusolver=11.4.4.55=0
42
+ - libcusparse=12.0.2.55=0
43
+ - libdeflate=1.17=h5eee18b_1
44
+ - libffi=3.4.4=h6a678d5_0
45
+ - libgcc-ng=11.2.0=h1234567_1
46
+ - libgomp=11.2.0=h1234567_1
47
+ - libiconv=1.16=h7f8727e_2
48
+ - libidn2=2.3.4=h5eee18b_0
49
+ - libjpeg-turbo=2.0.0=h9bf148f_0
50
+ - libnpp=12.0.2.50=0
51
+ - libnvjitlink=12.1.105=0
52
+ - libnvjpeg=12.1.1.14=0
53
+ - libpng=1.6.39=h5eee18b_0
54
+ - libstdcxx-ng=11.2.0=h1234567_1
55
+ - libtasn1=4.19.0=h5eee18b_0
56
+ - libtiff=4.5.1=h6a678d5_0
57
+ - libunistring=0.9.10=h27cfd23_0
58
+ - libuuid=1.41.5=h5eee18b_0
59
+ - libwebp=1.3.2=h11a3e52_0
60
+ - libwebp-base=1.3.2=h5eee18b_0
61
+ - llvm-openmp=14.0.6=h9e868ea_0
62
+ - lz4-c=1.9.4=h6a678d5_0
63
+ - markupsafe=2.1.1=py311h5eee18b_0
64
+ - mkl=2023.1.0=h213fc3f_46344
65
+ - mkl-service=2.4.0=py311h5eee18b_1
66
+ - mkl_fft=1.3.8=py311h5eee18b_0
67
+ - mkl_random=1.2.4=py311hdb19cb5_0
68
+ - mpc=1.1.0=h10f8cd9_1
69
+ - mpfr=4.0.2=hb69a4c5_1
70
+ - mpmath=1.3.0=py311h06a4308_0
71
+ - ncurses=6.4=h6a678d5_0
72
+ - nettle=3.7.3=hbbd107a_1
73
+ - networkx=3.1=py311h06a4308_0
74
+ - numpy=1.26.2=py311h08b1b3b_0
75
+ - numpy-base=1.26.2=py311hf175353_0
76
+ - openh264=2.1.1=h4ff587b_0
77
+ - openjpeg=2.4.0=h3ad879b_0
78
+ - openssl=3.0.12=h7f8727e_0
79
+ - pycparser=2.21=pyhd3eb1b0_0
80
+ - pyopenssl=23.2.0=py311h06a4308_0
81
+ - pysocks=1.7.1=py311h06a4308_0
82
+ - python=3.11.5=h955ad1f_0
83
+ - pytorch-cuda=12.1=ha16c6d3_5
84
+ - pytorch-mutex=1.0=cuda
85
+ - pyyaml=6.0.1=py311h5eee18b_0
86
+ - readline=8.2=h5eee18b_0
87
+ - requests=2.31.0=py311h06a4308_0
88
+ - setuptools=68.2.2=py311h06a4308_0
89
+ - sqlite=3.41.2=h5eee18b_0
90
+ - sympy=1.12=py311h06a4308_0
91
+ - tbb=2021.8.0=hdb19cb5_0
92
+ - tk=8.6.12=h1ccaba5_0
93
+ - wheel=0.41.2=py311h06a4308_0
94
+ - xz=5.4.5=h5eee18b_0
95
+ - yaml=0.2.5=h7b6447c_0
96
+ - zlib=1.2.13=h5eee18b_0
97
+ - zstd=1.5.5=hc292b87_0
98
+ - pip:
99
+ - absl-py==2.0.0
100
+ - accelerate==0.29.3
101
+ - aiohttp==3.9.1
102
+ - aiosignal==1.3.1
103
+ - annotated-types==0.6.0
104
+ - anyio==4.2.0
105
+ - appdirs==1.4.4
106
+ - argon2-cffi==23.1.0
107
+ - argon2-cffi-bindings==21.2.0
108
+ - arrow==1.3.0
109
+ - asttokens==2.4.1
110
+ - astunparse==1.6.3
111
+ - async-lru==2.0.4
112
+ - attrs==23.1.0
113
+ - audioread==3.0.1
114
+ - babel==2.14.0
115
+ - beautifulsoup4==4.12.3
116
+ - bitsandbytes==0.43.1
117
+ - bleach==6.1.0
118
+ - cachetools==5.3.2
119
+ - chardet==5.2.0
120
+ - charset-normalizer==3.3.2
121
+ - click==8.1.7
122
+ - comm==0.2.1
123
+ - datasets==2.18.1.dev0
124
+ - debugpy==1.8.1
125
+ - decorator==5.1.1
126
+ - deepspeed==0.12.2
127
+ - defusedxml==0.7.1
128
+ - dill==0.3.7
129
+ - docker-pycreds==0.4.0
130
+ - docstring-parser==0.15
131
+ - einops==0.7.0
132
+ - evaluate==0.4.0
133
+ - executing==2.0.1
134
+ - fastjsonschema==2.19.1
135
+ - flatbuffers==23.5.26
136
+ - fqdn==1.5.1
137
+ - frozenlist==1.4.1
138
+ - fsspec==2023.10.0
139
+ - gast==0.5.4
140
+ - gitdb==4.0.11
141
+ - gitpython==3.1.40
142
+ - google-auth==2.26.1
143
+ - google-auth-oauthlib==1.2.0
144
+ - google-pasta==0.2.0
145
+ - grpcio==1.60.0
146
+ - h11==0.14.0
147
+ - h5py==3.10.0
148
+ - hf-transfer==0.1.5
149
+ - hjson==3.1.0
150
+ - httpcore==1.0.2
151
+ - httpx==0.26.0
152
+ - huggingface-hub==0.22.2
153
+ - idna==3.6
154
+ - ipdb==0.13.13
155
+ - ipykernel==6.29.2
156
+ - ipython==8.21.0
157
+ - isoduration==20.11.0
158
+ - jedi==0.19.1
159
+ - jiwer==3.0.3
160
+ - joblib==1.3.2
161
+ - json5==0.9.14
162
+ - jsonpointer==2.4
163
+ - jsonschema==4.21.1
164
+ - jsonschema-specifications==2023.12.1
165
+ - jupyter-client==8.6.0
166
+ - jupyter-core==5.7.1
167
+ - jupyter-events==0.9.0
168
+ - jupyter-lsp==2.2.2
169
+ - jupyter-server==2.12.5
170
+ - jupyter-server-terminals==0.5.2
171
+ - jupyterlab==4.1.1
172
+ - jupyterlab-pygments==0.3.0
173
+ - jupyterlab-server==2.25.2
174
+ - keras==2.15.0
175
+ - lazy-loader==0.3
176
+ - libclang==16.0.6
177
+ - librosa==0.10.1
178
+ - llvmlite==0.41.1
179
+ - markdown==3.5.1
180
+ - markdown-it-py==3.0.0
181
+ - matplotlib-inline==0.1.6
182
+ - mdurl==0.1.2
183
+ - mistune==3.0.2
184
+ - ml-dtypes==0.2.0
185
+ - msgpack==1.0.7
186
+ - multidict==6.0.4
187
+ - multiprocess==0.70.15
188
+ - nbclient==0.9.0
189
+ - nbconvert==7.16.0
190
+ - nbformat==5.9.2
191
+ - nest-asyncio==1.6.0
192
+ - ninja==1.11.1.1
193
+ - nltk==3.8.1
194
+ - notebook-shim==0.2.3
195
+ - numba==0.58.1
196
+ - nvidia-cublas-cu12==12.1.3.1
197
+ - nvidia-cuda-cupti-cu12==12.1.105
198
+ - nvidia-cuda-nvrtc-cu12==12.1.105
199
+ - nvidia-cuda-runtime-cu12==12.1.105
200
+ - nvidia-cudnn-cu12==8.9.2.26
201
+ - nvidia-cufft-cu12==11.0.2.54
202
+ - nvidia-curand-cu12==10.3.2.106
203
+ - nvidia-cusolver-cu12==11.4.5.107
204
+ - nvidia-cusparse-cu12==12.1.0.106
205
+ - nvidia-nccl-cu12==2.20.5
206
+ - nvidia-nvjitlink-cu12==12.3.101
207
+ - nvidia-nvtx-cu12==12.1.105
208
+ - oauthlib==3.2.2
209
+ - opt-einsum==3.3.0
210
+ - overrides==7.7.0
211
+ - packaging==23.2
212
+ - pandas==2.1.4
213
+ - pandocfilters==1.5.1
214
+ - parso==0.8.3
215
+ - peft==0.7.1
216
+ - pexpect==4.9.0
217
+ - pillow==10.2.0
218
+ - pip==24.0
219
+ - platformdirs==4.1.0
220
+ - pooch==1.8.0
221
+ - prometheus-client==0.19.0
222
+ - prompt-toolkit==3.0.43
223
+ - protobuf==3.20.2
224
+ - psutil==5.9.7
225
+ - ptyprocess==0.7.0
226
+ - pure-eval==0.2.2
227
+ - py-cpuinfo==9.0.0
228
+ - pyarrow==14.0.2
229
+ - pyarrow-hotfix==0.6
230
+ - pyasn1==0.5.1
231
+ - pyasn1-modules==0.3.0
232
+ - pydantic==2.6.0
233
+ - pydantic-core==2.16.1
234
+ - pygments==2.17.2
235
+ - pynvml==11.5.0
236
+ - python-dateutil==2.8.2
237
+ - python-json-logger==2.0.7
238
+ - pytorch-triton==3.0.0+989adb9a29
239
+ - pytz==2023.3.post1
240
+ - pyzmq==25.1.2
241
+ - rapidfuzz==3.6.1
242
+ - referencing==0.33.0
243
+ - regex==2023.12.25
244
+ - requests-oauthlib==1.3.1
245
+ - responses==0.18.0
246
+ - rfc3339-validator==0.1.4
247
+ - rfc3986-validator==0.1.1
248
+ - rich==13.7.0
249
+ - rpds-py==0.17.1
250
+ - rsa==4.9
251
+ - safetensors==0.4.1
252
+ - scikit-learn==1.3.2
253
+ - scipy==1.11.4
254
+ - send2trash==1.8.2
255
+ - sentencepiece==0.1.99
256
+ - sentry-sdk==1.39.1
257
+ - setproctitle==1.3.3
258
+ - shtab==1.6.5
259
+ - six==1.16.0
260
+ - smmap==5.0.1
261
+ - sniffio==1.3.0
262
+ - soundfile==0.12.1
263
+ - soupsieve==2.5
264
+ - soxr==0.3.7
265
+ - stack-data==0.6.3
266
+ - tensorboard==2.15.1
267
+ - tensorboard-data-server==0.7.2
268
+ - tensorflow-cpu==2.15.0.post1
269
+ - tensorflow-estimator==2.15.0
270
+ - tensorflow-io-gcs-filesystem==0.35.0
271
+ - termcolor==2.4.0
272
+ - terminado==0.18.0
273
+ - threadpoolctl==3.2.0
274
+ - tinycss2==1.2.1
275
+ - tokenizers==0.15.0
276
+ - torch==2.4.0.dev20240323+cu121
277
+ - torchaudio==2.2.0.dev20240323+cu121
278
+ - torchvision==0.19.0.dev20240323+cu121
279
+ - tornado==6.4
280
+ - tqdm==4.66.1
281
+ - traitlets==5.14.1
282
+ - transformers==4.39.0.dev0
283
+ - triton==2.2.0
284
+ - trl==0.8.6
285
+ - types-python-dateutil==2.8.19.20240106
286
+ - typing-extensions==4.9.0
287
+ - tyro==0.7.0
288
+ - tzdata==2023.3
289
+ - uri-template==1.3.0
290
+ - urllib3==2.1.0
291
+ - wandb==0.16.1
292
+ - wcwidth==0.2.13
293
+ - webcolors==1.13
294
+ - webencodings==0.5.1
295
+ - websocket-client==1.7.0
296
+ - werkzeug==3.0.1
297
+ - wrapt==1.14.1
298
+ - xxhash==3.4.1
299
+ - yarl==1.9.4
300
+ prefix: /fsx/sanchit/miniconda3/envs/venv
wandb/run-20240424_164324-xfbnm7qo/files/config.yaml ADDED
@@ -0,0 +1,663 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ python_version: 3.11.5
7
+ cli_version: 0.16.1
8
+ framework: huggingface
9
+ huggingface_version: 4.40.0.dev0
10
+ is_jupyter_run: false
11
+ is_kaggle_kernel: false
12
+ start_time: 1713977004.542006
13
+ t:
14
+ 1:
15
+ - 1
16
+ - 2
17
+ - 3
18
+ - 5
19
+ - 11
20
+ - 49
21
+ - 51
22
+ - 53
23
+ - 55
24
+ - 71
25
+ - 84
26
+ - 98
27
+ 2:
28
+ - 1
29
+ - 2
30
+ - 3
31
+ - 5
32
+ - 11
33
+ - 49
34
+ - 51
35
+ - 53
36
+ - 55
37
+ - 71
38
+ - 84
39
+ - 98
40
+ 3:
41
+ - 7
42
+ - 23
43
+ 4: 3.11.5
44
+ 5: 0.16.1
45
+ 6: 4.40.0.dev0
46
+ 8:
47
+ - 5
48
+ 9:
49
+ 1: transformers_trainer
50
+ 13: linux-x86_64
51
+ m:
52
+ - 1: train/global_step
53
+ 6:
54
+ - 3
55
+ - 1: train/loss
56
+ 5: 1
57
+ 6:
58
+ - 1
59
+ - 1: train/grad_norm
60
+ 5: 1
61
+ 6:
62
+ - 1
63
+ - 1: train/learning_rate
64
+ 5: 1
65
+ 6:
66
+ - 1
67
+ - 1: train/epoch
68
+ 5: 1
69
+ 6:
70
+ - 1
71
+ vocab_size:
72
+ desc: null
73
+ value: 32000
74
+ max_position_embeddings:
75
+ desc: null
76
+ value: 32768
77
+ hidden_size:
78
+ desc: null
79
+ value: 4096
80
+ intermediate_size:
81
+ desc: null
82
+ value: 14336
83
+ num_hidden_layers:
84
+ desc: null
85
+ value: 6
86
+ num_attention_heads:
87
+ desc: null
88
+ value: 32
89
+ sliding_window:
90
+ desc: null
91
+ value: 4096
92
+ num_key_value_heads:
93
+ desc: null
94
+ value: 8
95
+ hidden_act:
96
+ desc: null
97
+ value: silu
98
+ initializer_range:
99
+ desc: null
100
+ value: 0.02
101
+ rms_norm_eps:
102
+ desc: null
103
+ value: 1.0e-05
104
+ use_cache:
105
+ desc: null
106
+ value: false
107
+ rope_theta:
108
+ desc: null
109
+ value: 10000.0
110
+ attention_dropout:
111
+ desc: null
112
+ value: 0.0
113
+ return_dict:
114
+ desc: null
115
+ value: true
116
+ output_hidden_states:
117
+ desc: null
118
+ value: false
119
+ output_attentions:
120
+ desc: null
121
+ value: false
122
+ torchscript:
123
+ desc: null
124
+ value: false
125
+ torch_dtype:
126
+ desc: null
127
+ value: bfloat16
128
+ use_bfloat16:
129
+ desc: null
130
+ value: false
131
+ tf_legacy_loss:
132
+ desc: null
133
+ value: false
134
+ pruned_heads:
135
+ desc: null
136
+ value: {}
137
+ tie_word_embeddings:
138
+ desc: null
139
+ value: false
140
+ chunk_size_feed_forward:
141
+ desc: null
142
+ value: 0
143
+ is_encoder_decoder:
144
+ desc: null
145
+ value: false
146
+ is_decoder:
147
+ desc: null
148
+ value: false
149
+ cross_attention_hidden_size:
150
+ desc: null
151
+ value: null
152
+ add_cross_attention:
153
+ desc: null
154
+ value: false
155
+ tie_encoder_decoder:
156
+ desc: null
157
+ value: false
158
+ max_length:
159
+ desc: null
160
+ value: 20
161
+ min_length:
162
+ desc: null
163
+ value: 0
164
+ do_sample:
165
+ desc: null
166
+ value: false
167
+ early_stopping:
168
+ desc: null
169
+ value: false
170
+ num_beams:
171
+ desc: null
172
+ value: 1
173
+ num_beam_groups:
174
+ desc: null
175
+ value: 1
176
+ diversity_penalty:
177
+ desc: null
178
+ value: 0.0
179
+ temperature:
180
+ desc: null
181
+ value: 1.0
182
+ top_k:
183
+ desc: null
184
+ value: 50
185
+ top_p:
186
+ desc: null
187
+ value: 1.0
188
+ typical_p:
189
+ desc: null
190
+ value: 1.0
191
+ repetition_penalty:
192
+ desc: null
193
+ value: 1.0
194
+ length_penalty:
195
+ desc: null
196
+ value: 1.0
197
+ no_repeat_ngram_size:
198
+ desc: null
199
+ value: 0
200
+ encoder_no_repeat_ngram_size:
201
+ desc: null
202
+ value: 0
203
+ bad_words_ids:
204
+ desc: null
205
+ value: null
206
+ num_return_sequences:
207
+ desc: null
208
+ value: 1
209
+ output_scores:
210
+ desc: null
211
+ value: false
212
+ return_dict_in_generate:
213
+ desc: null
214
+ value: false
215
+ forced_bos_token_id:
216
+ desc: null
217
+ value: null
218
+ forced_eos_token_id:
219
+ desc: null
220
+ value: null
221
+ remove_invalid_values:
222
+ desc: null
223
+ value: false
224
+ exponential_decay_length_penalty:
225
+ desc: null
226
+ value: null
227
+ suppress_tokens:
228
+ desc: null
229
+ value: null
230
+ begin_suppress_tokens:
231
+ desc: null
232
+ value: null
233
+ architectures:
234
+ desc: null
235
+ value:
236
+ - MistralForCausalLM
237
+ finetuning_task:
238
+ desc: null
239
+ value: null
240
+ id2label:
241
+ desc: null
242
+ value:
243
+ '0': LABEL_0
244
+ '1': LABEL_1
245
+ label2id:
246
+ desc: null
247
+ value:
248
+ LABEL_0: 0
249
+ LABEL_1: 1
250
+ tokenizer_class:
251
+ desc: null
252
+ value: null
253
+ prefix:
254
+ desc: null
255
+ value: null
256
+ bos_token_id:
257
+ desc: null
258
+ value: 1
259
+ pad_token_id:
260
+ desc: null
261
+ value: null
262
+ eos_token_id:
263
+ desc: null
264
+ value: 2
265
+ sep_token_id:
266
+ desc: null
267
+ value: null
268
+ decoder_start_token_id:
269
+ desc: null
270
+ value: null
271
+ task_specific_params:
272
+ desc: null
273
+ value: null
274
+ problem_type:
275
+ desc: null
276
+ value: null
277
+ _name_or_path:
278
+ desc: null
279
+ value: sanchit-gandhi/Mistral-7B-v0.1-6-layer
280
+ transformers_version:
281
+ desc: null
282
+ value: 4.40.0.dev0
283
+ model_type:
284
+ desc: null
285
+ value: mistral
286
+ output_dir:
287
+ desc: null
288
+ value: ./
289
+ overwrite_output_dir:
290
+ desc: null
291
+ value: true
292
+ do_train:
293
+ desc: null
294
+ value: false
295
+ do_eval:
296
+ desc: null
297
+ value: true
298
+ do_predict:
299
+ desc: null
300
+ value: false
301
+ evaluation_strategy:
302
+ desc: null
303
+ value: steps
304
+ prediction_loss_only:
305
+ desc: null
306
+ value: false
307
+ per_device_train_batch_size:
308
+ desc: null
309
+ value: 32
310
+ per_device_eval_batch_size:
311
+ desc: null
312
+ value: 32
313
+ per_gpu_train_batch_size:
314
+ desc: null
315
+ value: null
316
+ per_gpu_eval_batch_size:
317
+ desc: null
318
+ value: null
319
+ gradient_accumulation_steps:
320
+ desc: null
321
+ value: 1
322
+ eval_accumulation_steps:
323
+ desc: null
324
+ value: null
325
+ eval_delay:
326
+ desc: null
327
+ value: 0
328
+ learning_rate:
329
+ desc: null
330
+ value: 0.0001
331
+ weight_decay:
332
+ desc: null
333
+ value: 0.0
334
+ adam_beta1:
335
+ desc: null
336
+ value: 0.9
337
+ adam_beta2:
338
+ desc: null
339
+ value: 0.999
340
+ adam_epsilon:
341
+ desc: null
342
+ value: 1.0e-08
343
+ max_grad_norm:
344
+ desc: null
345
+ value: 1.0
346
+ num_train_epochs:
347
+ desc: null
348
+ value: 3.0
349
+ max_steps:
350
+ desc: null
351
+ value: 20000
352
+ lr_scheduler_type:
353
+ desc: null
354
+ value: linear
355
+ lr_scheduler_kwargs:
356
+ desc: null
357
+ value: {}
358
+ warmup_ratio:
359
+ desc: null
360
+ value: 0.0
361
+ warmup_steps:
362
+ desc: null
363
+ value: 500
364
+ log_level:
365
+ desc: null
366
+ value: info
367
+ log_level_replica:
368
+ desc: null
369
+ value: warning
370
+ log_on_each_node:
371
+ desc: null
372
+ value: true
373
+ logging_dir:
374
+ desc: null
375
+ value: ./runs/Apr24_16-42-31_ip-26-0-162-233
376
+ logging_strategy:
377
+ desc: null
378
+ value: steps
379
+ logging_first_step:
380
+ desc: null
381
+ value: true
382
+ logging_steps:
383
+ desc: null
384
+ value: 25
385
+ logging_nan_inf_filter:
386
+ desc: null
387
+ value: true
388
+ save_strategy:
389
+ desc: null
390
+ value: steps
391
+ save_steps:
392
+ desc: null
393
+ value: 500
394
+ save_total_limit:
395
+ desc: null
396
+ value: 5000
397
+ save_safetensors:
398
+ desc: null
399
+ value: true
400
+ save_on_each_node:
401
+ desc: null
402
+ value: false
403
+ save_only_model:
404
+ desc: null
405
+ value: false
406
+ no_cuda:
407
+ desc: null
408
+ value: false
409
+ use_cpu:
410
+ desc: null
411
+ value: false
412
+ use_mps_device:
413
+ desc: null
414
+ value: false
415
+ seed:
416
+ desc: null
417
+ value: 42
418
+ data_seed:
419
+ desc: null
420
+ value: null
421
+ jit_mode_eval:
422
+ desc: null
423
+ value: false
424
+ use_ipex:
425
+ desc: null
426
+ value: false
427
+ bf16:
428
+ desc: null
429
+ value: true
430
+ fp16:
431
+ desc: null
432
+ value: false
433
+ fp16_opt_level:
434
+ desc: null
435
+ value: O1
436
+ half_precision_backend:
437
+ desc: null
438
+ value: auto
439
+ bf16_full_eval:
440
+ desc: null
441
+ value: false
442
+ fp16_full_eval:
443
+ desc: null
444
+ value: false
445
+ tf32:
446
+ desc: null
447
+ value: null
448
+ local_rank:
449
+ desc: null
450
+ value: 0
451
+ ddp_backend:
452
+ desc: null
453
+ value: null
454
+ tpu_num_cores:
455
+ desc: null
456
+ value: null
457
+ tpu_metrics_debug:
458
+ desc: null
459
+ value: false
460
+ debug:
461
+ desc: null
462
+ value: []
463
+ dataloader_drop_last:
464
+ desc: null
465
+ value: false
466
+ eval_steps:
467
+ desc: null
468
+ value: 5000
469
+ dataloader_num_workers:
470
+ desc: null
471
+ value: 0
472
+ dataloader_prefetch_factor:
473
+ desc: null
474
+ value: null
475
+ past_index:
476
+ desc: null
477
+ value: -1
478
+ run_name:
479
+ desc: null
480
+ value: ./
481
+ disable_tqdm:
482
+ desc: null
483
+ value: false
484
+ remove_unused_columns:
485
+ desc: null
486
+ value: true
487
+ label_names:
488
+ desc: null
489
+ value: null
490
+ load_best_model_at_end:
491
+ desc: null
492
+ value: false
493
+ metric_for_best_model:
494
+ desc: null
495
+ value: null
496
+ greater_is_better:
497
+ desc: null
498
+ value: null
499
+ ignore_data_skip:
500
+ desc: null
501
+ value: false
502
+ fsdp:
503
+ desc: null
504
+ value: []
505
+ fsdp_min_num_params:
506
+ desc: null
507
+ value: 0
508
+ fsdp_config:
509
+ desc: null
510
+ value:
511
+ min_num_params: 0
512
+ xla: false
513
+ xla_fsdp_v2: false
514
+ xla_fsdp_grad_ckpt: false
515
+ fsdp_transformer_layer_cls_to_wrap:
516
+ desc: null
517
+ value: null
518
+ accelerator_config:
519
+ desc: null
520
+ value:
521
+ split_batches: false
522
+ dispatch_batches: null
523
+ even_batches: true
524
+ use_seedable_sampler: true
525
+ gradient_accumulation_kwargs: null
526
+ deepspeed:
527
+ desc: null
528
+ value: null
529
+ label_smoothing_factor:
530
+ desc: null
531
+ value: 0.0
532
+ optim:
533
+ desc: null
534
+ value: adamw_torch
535
+ optim_args:
536
+ desc: null
537
+ value: null
538
+ adafactor:
539
+ desc: null
540
+ value: false
541
+ group_by_length:
542
+ desc: null
543
+ value: false
544
+ length_column_name:
545
+ desc: null
546
+ value: length
547
+ report_to:
548
+ desc: null
549
+ value:
550
+ - tensorboard
551
+ - wandb
552
+ ddp_find_unused_parameters:
553
+ desc: null
554
+ value: null
555
+ ddp_bucket_cap_mb:
556
+ desc: null
557
+ value: null
558
+ ddp_broadcast_buffers:
559
+ desc: null
560
+ value: null
561
+ dataloader_pin_memory:
562
+ desc: null
563
+ value: true
564
+ dataloader_persistent_workers:
565
+ desc: null
566
+ value: false
567
+ skip_memory_metrics:
568
+ desc: null
569
+ value: true
570
+ use_legacy_prediction_loop:
571
+ desc: null
572
+ value: false
573
+ push_to_hub:
574
+ desc: null
575
+ value: true
576
+ resume_from_checkpoint:
577
+ desc: null
578
+ value: null
579
+ hub_model_id:
580
+ desc: null
581
+ value: null
582
+ hub_strategy:
583
+ desc: null
584
+ value: every_save
585
+ hub_token:
586
+ desc: null
587
+ value: <HUB_TOKEN>
588
+ hub_private_repo:
589
+ desc: null
590
+ value: false
591
+ hub_always_push:
592
+ desc: null
593
+ value: false
594
+ gradient_checkpointing:
595
+ desc: null
596
+ value: true
597
+ gradient_checkpointing_kwargs:
598
+ desc: null
599
+ value:
600
+ use_reentrant: false
601
+ include_inputs_for_metrics:
602
+ desc: null
603
+ value: false
604
+ fp16_backend:
605
+ desc: null
606
+ value: auto
607
+ push_to_hub_model_id:
608
+ desc: null
609
+ value: null
610
+ push_to_hub_organization:
611
+ desc: null
612
+ value: null
613
+ push_to_hub_token:
614
+ desc: null
615
+ value: <PUSH_TO_HUB_TOKEN>
616
+ mp_parameters:
617
+ desc: null
618
+ value: ''
619
+ auto_find_batch_size:
620
+ desc: null
621
+ value: false
622
+ full_determinism:
623
+ desc: null
624
+ value: false
625
+ torchdynamo:
626
+ desc: null
627
+ value: null
628
+ ray_scope:
629
+ desc: null
630
+ value: last
631
+ ddp_timeout:
632
+ desc: null
633
+ value: 7200
634
+ torch_compile:
635
+ desc: null
636
+ value: false
637
+ torch_compile_backend:
638
+ desc: null
639
+ value: null
640
+ torch_compile_mode:
641
+ desc: null
642
+ value: null
643
+ dispatch_batches:
644
+ desc: null
645
+ value: null
646
+ split_batches:
647
+ desc: null
648
+ value: null
649
+ include_tokens_per_second:
650
+ desc: null
651
+ value: false
652
+ include_num_input_tokens_seen:
653
+ desc: null
654
+ value: false
655
+ neftune_noise_alpha:
656
+ desc: null
657
+ value: null
658
+ optim_target_modules:
659
+ desc: null
660
+ value: null
661
+ max_seq_length:
662
+ desc: null
663
+ value: 2048
wandb/run-20240424_164324-xfbnm7qo/files/output.log ADDED
@@ -0,0 +1,522 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0%| | 0/20000 [00:00<?, ?it/s]/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.)
2
+ warnings.warn(
3
+ 0%| | 1/20000 [00:03<16:45:22, 3.02s/it]
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+ 0%| | 25/20000 [00:49<10:30:28, 1.89s/it]
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+ 0%|▏ | 50/20000 [01:36<10:30:25, 1.90s/it]
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+ 0%|▎ | 75/20000 [02:23<10:25:09, 1.88s/it]
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+ 0%|▍ | 100/20000 [03:10<10:21:12, 1.87s/it]
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+ 1%|▍ | 125/20000 [03:57<10:17:00, 1.86s/it]
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+ 1%|▌ | 150/20000 [04:43<10:12:27, 1.85s/it]
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+ 1%|▋ | 174/20000 [05:28<10:11:10, 1.85s/it]
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+ 1%|▊ | 199/20000 [06:14<10:07:08, 1.84s/it]
198
+
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+ 1%|▊ | 225/20000 [07:02<10:09:09, 1.85s/it]
223
+
224
+
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+
233
+
234
+
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
+
244
+
245
+
246
+ 1%|▉ | 250/20000 [07:48<10:05:59, 1.84s/it]
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
+
260
+
261
+
262
+
263
+
264
+
265
+
266
+
267
+
268
+
269
+ 1%|█ | 274/20000 [08:32<10:02:25, 1.83s/it]
270
+
271
+
272
+
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
+
288
+
289
+
290
+
291
+
292
+
293
+ 1%|█▏ | 299/20000 [09:18<10:01:03, 1.83s/it]
294
+
295
+
296
+
297
+
298
+
299
+
300
+
301
+
302
+
303
+
304
+
305
+
306
+
307
+
308
+
309
+
310
+
311
+
312
+
313
+
314
+
315
+
316
+
317
+ 2%|█▏ | 324/20000 [10:04<10:02:02, 1.84s/it]
318
+
319
+
320
+
321
+
322
+
323
+
324
+
325
+
326
+
327
+
328
+
329
+
330
+
331
+
332
+
333
+
334
+
335
+
336
+
337
+
338
+
339
+
340
+
341
+ 2%|█▎ | 350/20000 [10:52<9:54:57, 1.82s/it]
342
+
343
+
344
+
345
+
346
+
347
+
348
+
349
+
350
+
351
+
352
+
353
+
354
+
355
+
356
+
357
+
358
+
359
+
360
+
361
+
362
+
363
+
364
+
365
+ 2%|█▍ | 375/20000 [11:37<10:00:16, 1.84s/it]
366
+
367
+
368
+
369
+
370
+
371
+
372
+
373
+
374
+
375
+
376
+
377
+
378
+
379
+
380
+
381
+
382
+
383
+
384
+
385
+
386
+
387
+
388
+
389
+ 2%|█▌ | 400/20000 [12:23<9:56:59, 1.83s/it]
390
+
391
+
392
+
393
+
394
+
395
+
396
+
397
+
398
+
399
+
400
+
401
+
402
+
403
+
404
+
405
+
406
+
407
+
408
+
409
+
410
+
411
+
412
+
413
+ 2%|█▋ | 425/20000 [13:09<9:54:49, 1.82s/it]
414
+
415
+
416
+
417
+
418
+
419
+
420
+
421
+
422
+
423
+
424
+
425
+
426
+
427
+
428
+
429
+
430
+
431
+
432
+
433
+
434
+
435
+
436
+
437
+ 2%|█▋ | 450/20000 [13:54<9:56:31, 1.83s/it]
438
+
439
+
440
+
441
+
442
+
443
+
444
+
445
+
446
+
447
+
448
+
449
+
450
+
451
+
452
+
453
+
454
+
455
+
456
+
457
+
458
+
459
+
460
+ 2%|█▊ | 474/20000 [14:38<9:55:46, 1.83s/it]
461
+
462
+
463
+
464
+
465
+
466
+
467
+
468
+
469
+
470
+
471
+
472
+
473
+
474
+
475
+
476
+
477
+
478
+
479
+
480
+
481
+
482
+
483
+ 2%|█▉ | 500/20000 [15:26<9:52:31, 1.82s/it][INFO|trainer.py:3304] 2024-04-24 16:58:56,780 >> Saving model checkpoint to ./checkpoint-500
484
+ [INFO|configuration_utils.py:471] 2024-04-24 16:58:56,784 >> Configuration saved in ./checkpoint-500/config.json
485
+ [INFO|configuration_utils.py:697] 2024-04-24 16:58:56,788 >> Configuration saved in ./checkpoint-500/generation_config.json
486
+ {'loss': 2.0773, 'grad_norm': 4.6875, 'learning_rate': 0.0001, 'epoch': 0.12}
487
+ [INFO|modeling_utils.py:2590] 2024-04-24 16:59:01,066 >> Model weights saved in ./checkpoint-500/model.safetensors
488
+ [INFO|tokenization_utils_base.py:2488] 2024-04-24 16:59:01,079 >> tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
489
+ [INFO|tokenization_utils_base.py:2497] 2024-04-24 16:59:01,081 >> Special tokens file saved in ./checkpoint-500/special_tokens_map.json
490
+ [INFO|tokenization_utils_base.py:2488] 2024-04-24 16:59:11,382 >> tokenizer config file saved in ./tokenizer_config.json
491
+ [INFO|tokenization_utils_base.py:2497] 2024-04-24 16:59:11,384 >> Special tokens file saved in ./special_tokens_map.json
492
+ /fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.)
493
+ warnings.warn(
494
+
495
+
496
+
497
+
498
+
499
+
500
+
501
+
502
+
503
+
504
+
505
+
506
+
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+ 3%|██ | 524/20000 [16:24<9:52:57, 1.83s/it]
516
+
517
+
518
+
519
+
520
+
521
+
522
+
wandb/run-20240424_164324-xfbnm7qo/files/requirements.txt ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.0.0
2
+ accelerate==0.29.3
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ anyio==4.2.0
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ arrow==1.3.0
11
+ asttokens==2.4.1
12
+ astunparse==1.6.3
13
+ async-lru==2.0.4
14
+ attrs==23.1.0
15
+ audioread==3.0.1
16
+ babel==2.14.0
17
+ beautifulsoup4==4.12.3
18
+ bitsandbytes==0.43.1
19
+ bleach==6.1.0
20
+ brotli==1.0.9
21
+ cachetools==5.3.2
22
+ certifi==2023.11.17
23
+ cffi==1.16.0
24
+ chardet==5.2.0
25
+ charset-normalizer==2.0.4
26
+ click==8.1.7
27
+ comm==0.2.1
28
+ cryptography==41.0.7
29
+ datasets==2.18.1.dev0
30
+ debugpy==1.8.1
31
+ decorator==5.1.1
32
+ deepspeed==0.12.2
33
+ defusedxml==0.7.1
34
+ dill==0.3.7
35
+ docker-pycreds==0.4.0
36
+ docstring-parser==0.15
37
+ einops==0.7.0
38
+ evaluate==0.4.0
39
+ executing==2.0.1
40
+ fastjsonschema==2.19.1
41
+ filelock==3.13.1
42
+ flatbuffers==23.5.26
43
+ fqdn==1.5.1
44
+ frozenlist==1.4.1
45
+ fsspec==2023.10.0
46
+ gast==0.5.4
47
+ gitdb==4.0.11
48
+ gitpython==3.1.40
49
+ gmpy2==2.1.2
50
+ google-auth-oauthlib==1.2.0
51
+ google-auth==2.26.1
52
+ google-pasta==0.2.0
53
+ grpcio==1.60.0
54
+ h11==0.14.0
55
+ h5py==3.10.0
56
+ hf-transfer==0.1.5
57
+ hjson==3.1.0
58
+ httpcore==1.0.2
59
+ httpx==0.26.0
60
+ huggingface-hub==0.22.2
61
+ idna==3.4
62
+ ipdb==0.13.13
63
+ ipykernel==6.29.2
64
+ ipython==8.21.0
65
+ isoduration==20.11.0
66
+ jedi==0.19.1
67
+ jinja2==3.1.2
68
+ jiwer==3.0.3
69
+ joblib==1.3.2
70
+ json5==0.9.14
71
+ jsonpointer==2.4
72
+ jsonschema-specifications==2023.12.1
73
+ jsonschema==4.21.1
74
+ jupyter-client==8.6.0
75
+ jupyter-core==5.7.1
76
+ jupyter-events==0.9.0
77
+ jupyter-lsp==2.2.2
78
+ jupyter-server-terminals==0.5.2
79
+ jupyter-server==2.12.5
80
+ jupyterlab-pygments==0.3.0
81
+ jupyterlab-server==2.25.2
82
+ jupyterlab==4.1.1
83
+ keras==2.15.0
84
+ lazy-loader==0.3
85
+ libclang==16.0.6
86
+ librosa==0.10.1
87
+ llvmlite==0.41.1
88
+ markdown-it-py==3.0.0
89
+ markdown==3.5.1
90
+ markupsafe==2.1.1
91
+ matplotlib-inline==0.1.6
92
+ mdurl==0.1.2
93
+ mistune==3.0.2
94
+ mkl-fft==1.3.8
95
+ mkl-random==1.2.4
96
+ mkl-service==2.4.0
97
+ ml-dtypes==0.2.0
98
+ mpmath==1.3.0
99
+ msgpack==1.0.7
100
+ multidict==6.0.4
101
+ multiprocess==0.70.15
102
+ nbclient==0.9.0
103
+ nbconvert==7.16.0
104
+ nbformat==5.9.2
105
+ nest-asyncio==1.6.0
106
+ networkx==3.1
107
+ ninja==1.11.1.1
108
+ nltk==3.8.1
109
+ notebook-shim==0.2.3
110
+ numba==0.58.1
111
+ numpy==1.26.2
112
+ nvidia-cublas-cu12==12.1.3.1
113
+ nvidia-cuda-cupti-cu12==12.1.105
114
+ nvidia-cuda-nvrtc-cu12==12.1.105
115
+ nvidia-cuda-runtime-cu12==12.1.105
116
+ nvidia-cudnn-cu12==8.9.2.26
117
+ nvidia-cufft-cu12==11.0.2.54
118
+ nvidia-curand-cu12==10.3.2.106
119
+ nvidia-cusolver-cu12==11.4.5.107
120
+ nvidia-cusparse-cu12==12.1.0.106
121
+ nvidia-nccl-cu12==2.20.5
122
+ nvidia-nvjitlink-cu12==12.3.101
123
+ nvidia-nvtx-cu12==12.1.105
124
+ oauthlib==3.2.2
125
+ opt-einsum==3.3.0
126
+ overrides==7.7.0
127
+ packaging==23.2
128
+ pandas==2.1.4
129
+ pandocfilters==1.5.1
130
+ parso==0.8.3
131
+ peft==0.7.1
132
+ pexpect==4.9.0
133
+ pillow==10.2.0
134
+ pip==24.0
135
+ platformdirs==4.1.0
136
+ pooch==1.8.0
137
+ prometheus-client==0.19.0
138
+ prompt-toolkit==3.0.43
139
+ protobuf==3.20.2
140
+ psutil==5.9.7
141
+ ptyprocess==0.7.0
142
+ pure-eval==0.2.2
143
+ py-cpuinfo==9.0.0
144
+ pyarrow-hotfix==0.6
145
+ pyarrow==14.0.2
146
+ pyasn1-modules==0.3.0
147
+ pyasn1==0.5.1
148
+ pycparser==2.21
149
+ pydantic-core==2.16.1
150
+ pydantic==2.6.0
151
+ pygments==2.17.2
152
+ pynvml==11.5.0
153
+ pyopenssl==23.2.0
154
+ pysocks==1.7.1
155
+ python-dateutil==2.8.2
156
+ python-json-logger==2.0.7
157
+ pytorch-triton==3.0.0+989adb9a29
158
+ pytz==2023.3.post1
159
+ pyyaml==6.0.1
160
+ pyzmq==25.1.2
161
+ rapidfuzz==3.6.1
162
+ referencing==0.33.0
163
+ regex==2023.12.25
164
+ requests-oauthlib==1.3.1
165
+ requests==2.31.0
166
+ responses==0.18.0
167
+ rfc3339-validator==0.1.4
168
+ rfc3986-validator==0.1.1
169
+ rich==13.7.0
170
+ rpds-py==0.17.1
171
+ rsa==4.9
172
+ safetensors==0.4.1
173
+ scikit-learn==1.3.2
174
+ scipy==1.11.4
175
+ send2trash==1.8.2
176
+ sentencepiece==0.1.99
177
+ sentry-sdk==1.39.1
178
+ setproctitle==1.3.3
179
+ setuptools==68.2.2
180
+ shtab==1.6.5
181
+ six==1.16.0
182
+ smmap==5.0.1
183
+ sniffio==1.3.0
184
+ soundfile==0.12.1
185
+ soupsieve==2.5
186
+ soxr==0.3.7
187
+ stack-data==0.6.3
188
+ sympy==1.12
189
+ tensorboard-data-server==0.7.2
190
+ tensorboard==2.15.1
191
+ tensorflow-cpu==2.15.0.post1
192
+ tensorflow-estimator==2.15.0
193
+ tensorflow-io-gcs-filesystem==0.35.0
194
+ termcolor==2.4.0
195
+ terminado==0.18.0
196
+ threadpoolctl==3.2.0
197
+ tinycss2==1.2.1
198
+ tokenizers==0.15.0
199
+ torch==2.4.0.dev20240323+cu121
200
+ torchaudio==2.2.0.dev20240323+cu121
201
+ torchvision==0.19.0.dev20240323+cu121
202
+ tornado==6.4
203
+ tqdm==4.66.1
204
+ traitlets==5.14.1
205
+ transformers==4.39.0.dev0
206
+ triton==2.2.0
207
+ trl==0.8.6
208
+ types-python-dateutil==2.8.19.20240106
209
+ typing-extensions==4.10.0
210
+ tyro==0.7.0
211
+ tzdata==2023.3
212
+ uri-template==1.3.0
213
+ urllib3==1.26.18
214
+ wandb==0.16.1
215
+ wcwidth==0.2.13
216
+ webcolors==1.13
217
+ webencodings==0.5.1
218
+ websocket-client==1.7.0
219
+ werkzeug==3.0.1
220
+ wheel==0.41.2
221
+ wrapt==1.14.1
222
+ xxhash==3.4.1
223
+ yarl==1.9.4
wandb/run-20240424_164324-xfbnm7qo/files/wandb-metadata.json ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1048-aws-x86_64-with-glibc2.31",
3
+ "python": "3.11.5",
4
+ "heartbeatAt": "2024-04-24T16:43:25.058035",
5
+ "startedAt": "2024-04-24T16:43:24.523748",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "config_full.yaml"
10
+ ],
11
+ "state": "running",
12
+ "program": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py",
13
+ "codePathLocal": "run_sft.py",
14
+ "codePath": "run_sft.py",
15
+ "git": {
16
+ "remote": "https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat",
17
+ "commit": "cbea69c6b95c970317a1e47c3f614b55b33f8ed9"
18
+ },
19
+ "email": null,
20
+ "root": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat",
21
+ "host": "ip-26-0-162-233",
22
+ "username": "sanchit",
23
+ "executable": "/fsx/sanchit/miniconda3/envs/venv/bin/python",
24
+ "cpu_count": 96,
25
+ "cpu_count_logical": 96,
26
+ "cpu_freq": {
27
+ "current": 2729.8387291666663,
28
+ "min": 0.0,
29
+ "max": 0.0
30
+ },
31
+ "cpu_freq_per_core": [
32
+ {
33
+ "current": 2650.0,
34
+ "min": 0.0,
35
+ "max": 0.0
36
+ },
37
+ {
38
+ "current": 2650.0,
39
+ "min": 0.0,
40
+ "max": 0.0
41
+ },
42
+ {
43
+ "current": 3598.161,
44
+ "min": 0.0,
45
+ "max": 0.0
46
+ },
47
+ {
48
+ "current": 2650.0,
49
+ "min": 0.0,
50
+ "max": 0.0
51
+ },
52
+ {
53
+ "current": 3584.12,
54
+ "min": 0.0,
55
+ "max": 0.0
56
+ },
57
+ {
58
+ "current": 2650.0,
59
+ "min": 0.0,
60
+ "max": 0.0
61
+ },
62
+ {
63
+ "current": 2650.0,
64
+ "min": 0.0,
65
+ "max": 0.0
66
+ },
67
+ {
68
+ "current": 2650.0,
69
+ "min": 0.0,
70
+ "max": 0.0
71
+ },
72
+ {
73
+ "current": 2650.0,
74
+ "min": 0.0,
75
+ "max": 0.0
76
+ },
77
+ {
78
+ "current": 2650.0,
79
+ "min": 0.0,
80
+ "max": 0.0
81
+ },
82
+ {
83
+ "current": 2650.0,
84
+ "min": 0.0,
85
+ "max": 0.0
86
+ },
87
+ {
88
+ "current": 3598.175,
89
+ "min": 0.0,
90
+ "max": 0.0
91
+ },
92
+ {
93
+ "current": 2650.0,
94
+ "min": 0.0,
95
+ "max": 0.0
96
+ },
97
+ {
98
+ "current": 2650.0,
99
+ "min": 0.0,
100
+ "max": 0.0
101
+ },
102
+ {
103
+ "current": 2650.0,
104
+ "min": 0.0,
105
+ "max": 0.0
106
+ },
107
+ {
108
+ "current": 2650.0,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2650.0,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 3598.329,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2650.0,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2650.0,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2650.0,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2650.0,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2650.0,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2650.0,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 3596.81,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2650.0,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2650.0,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 3598.102,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2650.0,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2650.0,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2650.0,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2650.0,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2650.0,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2650.0,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ },
202
+ {
203
+ "current": 2650.0,
204
+ "min": 0.0,
205
+ "max": 0.0
206
+ },
207
+ {
208
+ "current": 2650.0,
209
+ "min": 0.0,
210
+ "max": 0.0
211
+ },
212
+ {
213
+ "current": 3596.611,
214
+ "min": 0.0,
215
+ "max": 0.0
216
+ },
217
+ {
218
+ "current": 2650.0,
219
+ "min": 0.0,
220
+ "max": 0.0
221
+ },
222
+ {
223
+ "current": 2650.0,
224
+ "min": 0.0,
225
+ "max": 0.0
226
+ },
227
+ {
228
+ "current": 2650.0,
229
+ "min": 0.0,
230
+ "max": 0.0
231
+ },
232
+ {
233
+ "current": 3598.198,
234
+ "min": 0.0,
235
+ "max": 0.0
236
+ },
237
+ {
238
+ "current": 2650.0,
239
+ "min": 0.0,
240
+ "max": 0.0
241
+ },
242
+ {
243
+ "current": 2650.0,
244
+ "min": 0.0,
245
+ "max": 0.0
246
+ },
247
+ {
248
+ "current": 2650.0,
249
+ "min": 0.0,
250
+ "max": 0.0
251
+ },
252
+ {
253
+ "current": 2650.0,
254
+ "min": 0.0,
255
+ "max": 0.0
256
+ },
257
+ {
258
+ "current": 2650.0,
259
+ "min": 0.0,
260
+ "max": 0.0
261
+ },
262
+ {
263
+ "current": 2650.0,
264
+ "min": 0.0,
265
+ "max": 0.0
266
+ },
267
+ {
268
+ "current": 2650.0,
269
+ "min": 0.0,
270
+ "max": 0.0
271
+ },
272
+ {
273
+ "current": 2650.0,
274
+ "min": 0.0,
275
+ "max": 0.0
276
+ },
277
+ {
278
+ "current": 2650.0,
279
+ "min": 0.0,
280
+ "max": 0.0
281
+ },
282
+ {
283
+ "current": 2650.0,
284
+ "min": 0.0,
285
+ "max": 0.0
286
+ },
287
+ {
288
+ "current": 2650.0,
289
+ "min": 0.0,
290
+ "max": 0.0
291
+ },
292
+ {
293
+ "current": 2650.0,
294
+ "min": 0.0,
295
+ "max": 0.0
296
+ },
297
+ {
298
+ "current": 2650.0,
299
+ "min": 0.0,
300
+ "max": 0.0
301
+ },
302
+ {
303
+ "current": 2650.0,
304
+ "min": 0.0,
305
+ "max": 0.0
306
+ },
307
+ {
308
+ "current": 2650.0,
309
+ "min": 0.0,
310
+ "max": 0.0
311
+ },
312
+ {
313
+ "current": 2650.0,
314
+ "min": 0.0,
315
+ "max": 0.0
316
+ },
317
+ {
318
+ "current": 2650.0,
319
+ "min": 0.0,
320
+ "max": 0.0
321
+ },
322
+ {
323
+ "current": 2650.0,
324
+ "min": 0.0,
325
+ "max": 0.0
326
+ },
327
+ {
328
+ "current": 2650.0,
329
+ "min": 0.0,
330
+ "max": 0.0
331
+ },
332
+ {
333
+ "current": 2650.0,
334
+ "min": 0.0,
335
+ "max": 0.0
336
+ },
337
+ {
338
+ "current": 2650.0,
339
+ "min": 0.0,
340
+ "max": 0.0
341
+ },
342
+ {
343
+ "current": 2650.0,
344
+ "min": 0.0,
345
+ "max": 0.0
346
+ },
347
+ {
348
+ "current": 2650.0,
349
+ "min": 0.0,
350
+ "max": 0.0
351
+ },
352
+ {
353
+ "current": 2650.0,
354
+ "min": 0.0,
355
+ "max": 0.0
356
+ },
357
+ {
358
+ "current": 2650.0,
359
+ "min": 0.0,
360
+ "max": 0.0
361
+ },
362
+ {
363
+ "current": 2650.0,
364
+ "min": 0.0,
365
+ "max": 0.0
366
+ },
367
+ {
368
+ "current": 2650.0,
369
+ "min": 0.0,
370
+ "max": 0.0
371
+ },
372
+ {
373
+ "current": 2650.0,
374
+ "min": 0.0,
375
+ "max": 0.0
376
+ },
377
+ {
378
+ "current": 2650.0,
379
+ "min": 0.0,
380
+ "max": 0.0
381
+ },
382
+ {
383
+ "current": 2650.0,
384
+ "min": 0.0,
385
+ "max": 0.0
386
+ },
387
+ {
388
+ "current": 2650.0,
389
+ "min": 0.0,
390
+ "max": 0.0
391
+ },
392
+ {
393
+ "current": 2650.0,
394
+ "min": 0.0,
395
+ "max": 0.0
396
+ },
397
+ {
398
+ "current": 2650.0,
399
+ "min": 0.0,
400
+ "max": 0.0
401
+ },
402
+ {
403
+ "current": 2650.0,
404
+ "min": 0.0,
405
+ "max": 0.0
406
+ },
407
+ {
408
+ "current": 2650.0,
409
+ "min": 0.0,
410
+ "max": 0.0
411
+ },
412
+ {
413
+ "current": 2650.0,
414
+ "min": 0.0,
415
+ "max": 0.0
416
+ },
417
+ {
418
+ "current": 2650.0,
419
+ "min": 0.0,
420
+ "max": 0.0
421
+ },
422
+ {
423
+ "current": 2650.0,
424
+ "min": 0.0,
425
+ "max": 0.0
426
+ },
427
+ {
428
+ "current": 2650.0,
429
+ "min": 0.0,
430
+ "max": 0.0
431
+ },
432
+ {
433
+ "current": 2650.0,
434
+ "min": 0.0,
435
+ "max": 0.0
436
+ },
437
+ {
438
+ "current": 2650.0,
439
+ "min": 0.0,
440
+ "max": 0.0
441
+ },
442
+ {
443
+ "current": 2650.0,
444
+ "min": 0.0,
445
+ "max": 0.0
446
+ },
447
+ {
448
+ "current": 2650.0,
449
+ "min": 0.0,
450
+ "max": 0.0
451
+ },
452
+ {
453
+ "current": 2650.0,
454
+ "min": 0.0,
455
+ "max": 0.0
456
+ },
457
+ {
458
+ "current": 2650.0,
459
+ "min": 0.0,
460
+ "max": 0.0
461
+ },
462
+ {
463
+ "current": 2650.0,
464
+ "min": 0.0,
465
+ "max": 0.0
466
+ },
467
+ {
468
+ "current": 2650.0,
469
+ "min": 0.0,
470
+ "max": 0.0
471
+ },
472
+ {
473
+ "current": 2650.0,
474
+ "min": 0.0,
475
+ "max": 0.0
476
+ },
477
+ {
478
+ "current": 2650.0,
479
+ "min": 0.0,
480
+ "max": 0.0
481
+ },
482
+ {
483
+ "current": 2650.0,
484
+ "min": 0.0,
485
+ "max": 0.0
486
+ },
487
+ {
488
+ "current": 2650.0,
489
+ "min": 0.0,
490
+ "max": 0.0
491
+ },
492
+ {
493
+ "current": 2650.0,
494
+ "min": 0.0,
495
+ "max": 0.0
496
+ },
497
+ {
498
+ "current": 2650.0,
499
+ "min": 0.0,
500
+ "max": 0.0
501
+ },
502
+ {
503
+ "current": 2650.0,
504
+ "min": 0.0,
505
+ "max": 0.0
506
+ },
507
+ {
508
+ "current": 2650.0,
509
+ "min": 0.0,
510
+ "max": 0.0
511
+ }
512
+ ],
513
+ "disk": {
514
+ "/": {
515
+ "total": 290.7472343444824,
516
+ "used": 59.25613021850586
517
+ }
518
+ },
519
+ "gpu": "NVIDIA H100 80GB HBM3",
520
+ "gpu_count": 8,
521
+ "gpu_devices": [
522
+ {
523
+ "name": "NVIDIA H100 80GB HBM3",
524
+ "memory_total": 85520809984
525
+ },
526
+ {
527
+ "name": "NVIDIA H100 80GB HBM3",
528
+ "memory_total": 85520809984
529
+ },
530
+ {
531
+ "name": "NVIDIA H100 80GB HBM3",
532
+ "memory_total": 85520809984
533
+ },
534
+ {
535
+ "name": "NVIDIA H100 80GB HBM3",
536
+ "memory_total": 85520809984
537
+ },
538
+ {
539
+ "name": "NVIDIA H100 80GB HBM3",
540
+ "memory_total": 85520809984
541
+ },
542
+ {
543
+ "name": "NVIDIA H100 80GB HBM3",
544
+ "memory_total": 85520809984
545
+ },
546
+ {
547
+ "name": "NVIDIA H100 80GB HBM3",
548
+ "memory_total": 85520809984
549
+ },
550
+ {
551
+ "name": "NVIDIA H100 80GB HBM3",
552
+ "memory_total": 85520809984
553
+ }
554
+ ],
555
+ "memory": {
556
+ "total": 1999.9855270385742
557
+ }
558
+ }
wandb/run-20240424_164324-xfbnm7qo/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/loss": 2.0215, "train/grad_norm": 4.125, "train/learning_rate": 9.987179487179488e-05, "train/epoch": 0.13, "train/global_step": 525, "_timestamp": 1713977997.0387745, "_runtime": 992.4967684745789, "_step": 21}
wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240424_164324-xfbnm7qo/logs/debug.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-04-24 16:43:24,533 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
2
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Configure stats pid to 1854033
3
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
4
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/settings
5
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py'}
8
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug.log
9
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log
10
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():564] calling init triggers
11
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
12
+ config: {}
13
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():614] starting backend
14
+ 2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():618] setting up manager
15
+ 2024-04-24 16:43:24,537 INFO MainThread:1854033 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-04-24 16:43:24,541 INFO MainThread:1854033 [wandb_init.py:init():624] backend started and connected
17
+ 2024-04-24 16:43:24,544 INFO MainThread:1854033 [wandb_init.py:init():716] updated telemetry
18
+ 2024-04-24 16:43:24,569 INFO MainThread:1854033 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
19
+ 2024-04-24 16:43:24,850 INFO MainThread:1854033 [wandb_run.py:_on_init():2254] communicating current version
20
+ 2024-04-24 16:43:24,896 INFO MainThread:1854033 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-04-24 16:43:24,896 INFO MainThread:1854033 [wandb_init.py:init():800] starting run threads in backend
23
+ 2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_console_start():2233] atexit reg
24
+ 2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2088] redirect: wrap_raw
25
+ 2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2153] Wrapping output streams.
26
+ 2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2178] Redirects installed.
27
+ 2024-04-24 16:43:30,533 INFO MainThread:1854033 [wandb_init.py:init():841] run started, returning control to user process
28
+ 2024-04-24 16:43:30,535 INFO MainThread:1854033 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.40.0.dev0', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 20000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr24_16-42-31_ip-26-0-162-233', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 5000, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 5000, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': 
False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
wandb/run-20240424_164324-xfbnm7qo/run-xfbnm7qo.wandb ADDED
Binary file (297 kB). View file