Commit
•
4ea2eae
1
Parent(s):
cbea69c
Training in progress, step 500
Browse files- accelerate_config.yaml +18 -0
- alignment/__init__.py +12 -0
- alignment/__pycache__/__init__.cpython-311.pyc +0 -0
- alignment/__pycache__/configs.cpython-311.pyc +0 -0
- alignment/__pycache__/data.cpython-311.pyc +0 -0
- alignment/__pycache__/model_utils.cpython-311.pyc +0 -0
- alignment/configs.py +254 -0
- alignment/data.py +190 -0
- alignment/model_utils.py +119 -0
- alignment/release.py +106 -0
- config.json +26 -0
- config_full.yaml +45 -0
- model.safetensors +3 -0
- run_sft.py +218 -0
- runs/Apr24_14-23-38_ip-26-0-162-233/events.out.tfevents.1713973415.ip-26-0-162-233.1840687.0 +3 -0
- runs/Apr24_16-42-31_ip-26-0-162-233/events.out.tfevents.1713977002.ip-26-0-162-233.1854033.0 +3 -0
- slurm_job.slurm +76 -0
- special_tokens_map.json +24 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +43 -0
- training_args.bin +3 -0
- wandb/debug-cli.sanchit.log +0 -0
- wandb/debug-internal.log +0 -0
- wandb/debug.log +28 -0
- wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml +300 -0
- wandb/run-20240424_154339-mwp0iutr/files/config.yaml +663 -0
- wandb/run-20240424_154339-mwp0iutr/files/output.log +131 -0
- wandb/run-20240424_154339-mwp0iutr/files/requirements.txt +223 -0
- wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json +558 -0
- wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json +1 -0
- wandb/run-20240424_154339-mwp0iutr/logs/debug-internal.log +209 -0
- wandb/run-20240424_154339-mwp0iutr/logs/debug.log +29 -0
- wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb +0 -0
- wandb/run-20240424_164324-xfbnm7qo/files/conda-environment.yaml +300 -0
- wandb/run-20240424_164324-xfbnm7qo/files/config.yaml +663 -0
- wandb/run-20240424_164324-xfbnm7qo/files/output.log +522 -0
- wandb/run-20240424_164324-xfbnm7qo/files/requirements.txt +223 -0
- wandb/run-20240424_164324-xfbnm7qo/files/wandb-metadata.json +558 -0
- wandb/run-20240424_164324-xfbnm7qo/files/wandb-summary.json +1 -0
- wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log +0 -0
- wandb/run-20240424_164324-xfbnm7qo/logs/debug.log +28 -0
- wandb/run-20240424_164324-xfbnm7qo/run-xfbnm7qo.wandb +0 -0
accelerate_config.yaml
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
compute_environment: LOCAL_MACHINE
|
2 |
+
debug: false
|
3 |
+
distributed_type: MULTI_GPU
|
4 |
+
downcast_bf16: 'no'
|
5 |
+
enable_cpu_affinity: false
|
6 |
+
gpu_ids: all
|
7 |
+
machine_rank: 0
|
8 |
+
main_training_function: main
|
9 |
+
mixed_precision: bf16
|
10 |
+
num_machines: 1
|
11 |
+
num_processes: 8
|
12 |
+
rdzv_backend: static
|
13 |
+
same_network: true
|
14 |
+
tpu_env: []
|
15 |
+
tpu_use_cluster: false
|
16 |
+
tpu_use_sudo: false
|
17 |
+
use_cpu: false
|
18 |
+
|
alignment/__init__.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__version__ = "0.3.0.dev0"
|
2 |
+
|
3 |
+
from .configs import DataArguments, DPOConfig, H4ArgumentParser, ModelArguments, SFTConfig
|
4 |
+
from .data import apply_chat_template, get_datasets
|
5 |
+
from .model_utils import (
|
6 |
+
get_checkpoint,
|
7 |
+
get_kbit_device_map,
|
8 |
+
get_peft_config,
|
9 |
+
get_quantization_config,
|
10 |
+
get_tokenizer,
|
11 |
+
is_adapter_model,
|
12 |
+
)
|
alignment/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (752 Bytes). View file
|
|
alignment/__pycache__/configs.cpython-311.pyc
ADDED
Binary file (14.1 kB). View file
|
|
alignment/__pycache__/data.cpython-311.pyc
ADDED
Binary file (9.06 kB). View file
|
|
alignment/__pycache__/model_utils.cpython-311.pyc
ADDED
Binary file (5.05 kB). View file
|
|
alignment/configs.py
ADDED
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
import dataclasses
|
16 |
+
import os
|
17 |
+
import sys
|
18 |
+
from dataclasses import dataclass, field
|
19 |
+
from typing import Any, Dict, List, NewType, Optional, Tuple
|
20 |
+
|
21 |
+
import transformers
|
22 |
+
from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, HfArgumentParser
|
23 |
+
|
24 |
+
|
25 |
+
MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
|
26 |
+
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
27 |
+
|
28 |
+
|
29 |
+
DataClassType = NewType("DataClassType", Any)
|
30 |
+
|
31 |
+
|
32 |
+
class H4ArgumentParser(HfArgumentParser):
    """`HfArgumentParser` variant that accepts a YAML config file plus `--key=value` command-line overrides."""

    @staticmethod
    def _unwrap_optional(field_type):
        """Return `X` for `Optional[X]` / `X | None` annotations, otherwise `field_type` unchanged."""
        import types
        from typing import Union, get_args, get_origin

        if get_origin(field_type) in (Union, types.UnionType):
            non_none = [t for t in get_args(field_type) if t is not type(None)]
            # Only unwrap the unambiguous `Optional[X]` case; real unions are left as they are.
            if len(non_none) == 1:
                return non_none[0]
        return field_type

    def parse_yaml_and_args(self, yaml_arg: str, other_args: Optional[List[str]] = None) -> List[dataclass]:
        """
        Parse a YAML file and overwrite the default/loaded values with the values provided to the command line.

        Args:
            yaml_arg (`str`):
                The path to the config file used
            other_args (`List[str]`, *optional*):
                A list of strings to parse as command line arguments, e.g. ['--arg=val', '--arg2=val2'].

        Returns:
            [`List[dataclass]`]: a list of dataclasses with the values from the YAML file and the command line

        Raises:
            ValueError: If an override is not of the form `--key=value`, or if the same override key matches a
                field in more than one dataclass.
        """
        arg_list = self.parse_yaml_file(os.path.abspath(yaml_arg))

        outputs = []
        # Turn the override list into a dict of key-value pairs. Split on the first "=" only, so that values
        # which themselves contain "=" are kept intact.
        overrides = {}
        for raw_arg in other_args or []:
            key, sep, value = raw_arg.partition("=")
            if not sep:
                raise ValueError(f"Expected a command line override of the form --key=value, got: {raw_arg}")
            overrides[key.strip("-")] = value
        used_args = {}

        # overwrite the default/loaded value with the value provided to the command line
        # adapted from https://github.com/huggingface/transformers/blob/d0b5002378daabf62769159add3e7d66d3f83c3b/src/transformers/hf_argparser.py#L327
        for data_yaml, data_class in zip(arg_list, self.dataclass_types):
            keys = {f.name for f in dataclasses.fields(data_yaml) if f.init}
            inputs = {k: v for k, v in vars(data_yaml).items() if k in keys}
            for arg, val in overrides.items():
                # add only if in keys
                if arg not in keys:
                    continue

                # `Optional[int]` and friends must be cast exactly like their bare counterparts
                base_type = self._unwrap_optional(data_yaml.__dataclass_fields__[arg].type)
                inputs[arg] = val

                # cast type for ints, floats (default to strings)
                if base_type in [int, float]:
                    inputs[arg] = base_type(val)

                if base_type == List[str]:
                    inputs[arg] = [str(v) for v in val.split(",")]

                # bool of a non-empty string is True, so we manually check for bools
                if base_type is bool:
                    inputs[arg] = val in ["true", "True"]

                # add to used-args so we can check if double add
                if arg in used_args:
                    raise ValueError(f"Duplicate argument provided: {arg}, may cause unexpected behavior")
                used_args[arg] = val

            obj = data_class(**inputs)
            outputs.append(obj)

        return outputs

    def parse(self) -> DataClassType | Tuple[DataClassType]:
        """
        Parse `sys.argv` into the configured dataclasses.

        Three invocation styles are recognised, based on `sys.argv`:
        a single `.yaml` path, a `.yaml` path followed by `--key=value` overrides, or plain command line
        arguments only.

        Returns:
            The single dataclass instance when exactly one was parsed, otherwise the full collection.
        """
        if len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"):
            # If we pass only one argument to the script and it's the path to a YAML file,
            # let's parse it to get our arguments.
            output = self.parse_yaml_file(os.path.abspath(sys.argv[1]))
        # parse command line args and yaml file
        elif len(sys.argv) > 2 and sys.argv[1].endswith(".yaml"):
            output = self.parse_yaml_and_args(os.path.abspath(sys.argv[1]), sys.argv[2:])
        # parse command line args only
        else:
            output = self.parse_args_into_dataclasses()

        if len(output) == 1:
            output = output[0]
        return output
|
104 |
+
|
105 |
+
|
106 |
+
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune.

    Raises:
        ValueError: On construction, if both `load_in_8bit` and `load_in_4bit` are set.
    """

    base_model_revision: Optional[str] = field(
        default=None,
        metadata={"help": ("The base model checkpoint for weights initialization with PEFT adatpers.")},
    )
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
            )
        },
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    # Defaults to None, so the annotation has to be Optional.
    model_code_revision: Optional[str] = field(default=None, metadata={"help": "The branch of the IFT model"})
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )
    trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code when loading a model."})
    use_flash_attention_2: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to use flash attention 2. You must install this manually by running `pip install flash-attn --no-build-isolation`"
            )
        },
    )
    use_peft: bool = field(
        default=False,
        metadata={"help": ("Whether to use PEFT or not for training.")},
    )
    lora_r: Optional[int] = field(
        default=16,
        metadata={"help": ("LoRA R value.")},
    )
    lora_alpha: Optional[int] = field(
        default=32,
        metadata={"help": ("LoRA alpha.")},
    )
    lora_dropout: Optional[float] = field(
        default=0.05,
        metadata={"help": ("LoRA dropout.")},
    )
    lora_target_modules: Optional[List[str]] = field(
        default=None,
        metadata={"help": ("LoRA target modules.")},
    )
    lora_modules_to_save: Optional[List[str]] = field(
        default=None,
        metadata={"help": ("Model layers to unfreeze & train")},
    )
    load_in_8bit: bool = field(default=False, metadata={"help": "use 8 bit precision"})
    load_in_4bit: bool = field(default=False, metadata={"help": "use 4 bit precision"})

    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4", metadata={"help": "precise the quantization type (fp4 or nf4)"}
    )
    use_bnb_nested_quant: bool = field(default=False, metadata={"help": "use nested quantization"})

    def __post_init__(self):
        # The two quantization modes are mutually exclusive.
        if self.load_in_8bit and self.load_in_4bit:
            raise ValueError("You can't use 8 bit and 4 bit precision at the same time")
|
183 |
+
|
184 |
+
|
185 |
+
@dataclass
class DataArguments:
    """
    Dataset-side options: which datasets/splits to load and how text is templated and tokenized.

    Every field is optional; only `dataset_splits` has a non-None default.
    """

    # Chat template override; None means no override was requested.
    chat_template: Optional[str] = field(
        default=None,
        metadata={"help": "The chat template to use."},
    )
    # Mapping of dataset name -> proportion.
    dataset_mixer: Optional[Dict[str, float]] = field(
        default=None, metadata={"help": ("Datasets and their proportions to be used for training ift/rl.")}
    )
    # A factory is used so that each instance gets its own list.
    dataset_splits: Optional[List[str]] = field(
        default_factory=lambda: ["train", "test"], metadata={"help": ("List of train test splits to use in the dataset")}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None, metadata={"help": "The number of processes to use for the preprocessing."}
    )
    truncation_side: Optional[str] = field(
        default=None,
        metadata={"help": "Truncation side to use for the tokenizer."},
    )
|
207 |
+
|
208 |
+
|
209 |
+
@dataclass
class SFTConfig(transformers.TrainingArguments):
    """
    `transformers.TrainingArguments` extended with the extra fields used for supervised fine-tuning.

    For all inherited parameters, see: https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/trainer#transformers.TrainingArguments
    """

    max_seq_length: Optional[int] = field(
        default=None, metadata={"help": ("Used by TRL for reward model training, which tries to read this parameter in init.")}
    )
    # Redeclared here with a default of True.
    logging_first_step: bool = field(
        default=True, metadata={"help": ("Whether to log and evaluate the first global_step or not.")}
    )
    # Redeclared here with a default of "adamw_torch".
    optim: Optional[str] = field(default="adamw_torch")
|
224 |
+
|
225 |
+
|
226 |
+
@dataclass
class DPOConfig(transformers.TrainingArguments):
    """
    `transformers.TrainingArguments` extended with the extra fields used for DPO training.

    For all inherited parameters, see: https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/trainer#transformers.TrainingArguments
    """

    beta: Optional[float] = field(
        default=0.1, metadata={"help": "The beta factor in DPO loss. Higher beta means less divergence from the initial policy."}
    )
    hub_model_revision: Optional[str] = field(
        default="main", metadata={"help": ("The Hub model branch to push the model to.")}
    )
    # Redeclared here with a default of True.
    logging_first_step: bool = field(
        default=True, metadata={"help": ("Whether to log and evaluate the first global_step or not.")}
    )
    max_prompt_length: Optional[int] = field(
        default=None, metadata={"help": ("For DPO, the maximum length of the prompt to use for conditioning the model.")}
    )
    max_length: Optional[int] = field(
        default=None, metadata={"help": ("Used by TRL for reward model training, which tries to read this parameter in init.")}
    )
    # Redeclared here with a default of "rmsprop".
    optim: Optional[str] = field(default="rmsprop")
    # Redeclared here with a default of False.
    remove_unused_columns: bool = field(default=False)
    loss_type: Optional[str] = field(
        default="sigmoid",
        metadata={"help": ("The loss type for DPO.")},
    )
|
alignment/data.py
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
import os
|
16 |
+
from typing import List, Literal, Optional
|
17 |
+
|
18 |
+
from datasets import DatasetDict, concatenate_datasets, load_dataset, load_from_disk
|
19 |
+
from datasets.builder import DatasetGenerationError
|
20 |
+
|
21 |
+
from .configs import DataArguments
|
22 |
+
|
23 |
+
|
24 |
+
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
|
25 |
+
|
26 |
+
|
27 |
+
def maybe_insert_system_message(messages, tokenizer):
    """
    Prepend an empty system message to `messages` (in place) when the tokenizer's chat template mentions one.

    Args:
        messages (`list` of `dict`):
            Conversation turns with `role` and `content` keys. Mutated in place.
        tokenizer:
            Object exposing a `chat_template` attribute and, optionally, `default_chat_template`.

    Returns:
        `None`. The list is left untouched when it already starts with a system message, when no chat
        template is available, or when the template never mentions "system".
    """
    if messages[0]["role"] == "system":
        return

    # chat template can be one of two attributes, we check in order
    chat_template = tokenizer.chat_template
    if chat_template is None:
        # Not every tokenizer object exposes `default_chat_template`, so look it up defensively.
        chat_template = getattr(tokenizer, "default_chat_template", None)

    # confirm the jinja template refers to a system message before inserting
    if chat_template is not None and "system" in chat_template:
        messages.insert(0, {"role": "system", "content": ""})
|
39 |
+
|
40 |
+
|
41 |
+
def apply_chat_template(
    example,
    tokenizer,
    task: Literal["sft", "generation", "rm", "dpo"],
):
    """
    Render the conversation(s) in `example` to text with the tokenizer's chat template.

    Keys written, per task:
        * `sft` / `generation`: `text` (with a generation prompt appended only for `generation`).
        * `rm`: `text_chosen` and `text_rejected`, each rendered from the full dialogue.
        * `dpo`: `text_prompt` (all but the last `chosen` turn, with an empty system message prepended when
          the dialogue does not start with one), plus `text_chosen` / `text_rejected` rendered from the
          last turn only.

    Returns:
        The same `example` object with the new keys added.

    Raises:
        ValueError: If `task` is not supported, or `rm`/`dpo` examples lack `chosen`/`rejected` keys.
    """
    if task == "sft" or task == "generation":
        conversation = example["messages"]
        # We add an empty system message if there is none
        maybe_insert_system_message(conversation, tokenizer)
        wants_generation_prompt = task == "generation"
        example["text"] = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=wants_generation_prompt
        )
        return example

    if task == "rm":
        has_preference_pair = all(k in example.keys() for k in ("chosen", "rejected"))
        if not has_preference_pair:
            raise ValueError(
                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
        chosen_dialogue = example["chosen"]
        rejected_dialogue = example["rejected"]
        # We add an empty system message if there is none
        maybe_insert_system_message(chosen_dialogue, tokenizer)
        maybe_insert_system_message(rejected_dialogue, tokenizer)

        example["text_chosen"] = tokenizer.apply_chat_template(chosen_dialogue, tokenize=False)
        example["text_rejected"] = tokenizer.apply_chat_template(rejected_dialogue, tokenize=False)
        return example

    if task == "dpo":
        has_preference_pair = all(k in example.keys() for k in ("chosen", "rejected"))
        if not has_preference_pair:
            raise ValueError(
                f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
        # For DPO, the inputs are triples of (prompt, chosen, rejected), where `chosen` and `rejected` are
        # the final turn of a dialogue, so the first N-1 turns form the prompt.
        prompt_turns = example["chosen"][:-1]
        # Prepend a system message if the first message is not a system message
        if example["chosen"][0]["role"] != "system":
            prompt_turns.insert(0, {"role": "system", "content": ""})
        # The final turn of each dialogue is the response being compared.
        chosen_reply = example["chosen"][-1:]
        rejected_reply = example["rejected"][-1:]
        example["text_chosen"] = tokenizer.apply_chat_template(chosen_reply, tokenize=False)
        example["text_rejected"] = tokenizer.apply_chat_template(rejected_reply, tokenize=False)
        example["text_prompt"] = tokenizer.apply_chat_template(prompt_turns, tokenize=False)
        return example

    raise ValueError(
        f"Task {task} not supported, please ensure that the provided task is one of {['sft', 'generation', 'rm', 'dpo']}"
    )
|
90 |
+
|
91 |
+
|
92 |
+
def get_datasets(
    data_config: DataArguments | dict,
    splits: Optional[List[str]] = None,
    shuffle: bool = True,
) -> DatasetDict:
    """
    Loads one or more datasets with varying training set proportions.

    Args:
        data_config (`DataArguments` or `dict`):
            Dataset configuration and split proportions.
        splits (`List[str]`, *optional*, defaults to `['train', 'test']`):
            Dataset splits to load and mix. Forwarded unchanged to `mix_datasets`.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns
        [`DatasetDict`]: The dataset dictionary containing the loaded datasets.

    Raises:
        ValueError: If `data_config` is neither a `DataArguments` instance nor a `dict`.
    """
    # A `None` sentinel replaces the former mutable list default; the effective default is unchanged.
    if splits is None:
        splits = ["train", "test"]

    if isinstance(data_config, DataArguments):
        # Structure of the config to read the datasets and their mix
        # datasets_mixer:
        #     - 'dataset1': 0.5
        #     - 'dataset2': 0.3
        #     - 'dataset3': 0.2
        dataset_mixer = data_config.dataset_mixer
    elif isinstance(data_config, dict):
        # Structure of the input is:
        #     dataset_mixer = {
        #             "dataset1": 0.5,
        #             "dataset2": 0.3,
        #             "dataset3": 0.2,
        #         }
        dataset_mixer = data_config
    else:
        raise ValueError(f"Data config {data_config} not recognized.")

    raw_datasets = mix_datasets(dataset_mixer, splits=splits, shuffle=shuffle)
    return raw_datasets
|
132 |
+
|
133 |
+
|
134 |
+
def mix_datasets(dataset_mixer: dict, splits: Optional[List[str]] = None, shuffle=True) -> DatasetDict:
    """
    Loads and mixes datasets according to proportions specified in `dataset_mixer`.

    Args:
        dataset_mixer (`dict`):
            Dictionary containing the dataset names and their training proportions. Test data is never
            subsampled.
        splits (Optional[List[str]], *optional*, defaults to `None`):
            Dataset splits to load and mix; `None` is treated as `['train', 'test']`. The FIRST entry is
            loaded as training data and every following entry is pooled into the `test` set.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns:
        [`DatasetDict`]: Holds a `train` and/or `test` entry, depending on what was loaded.

    Raises:
        ValueError: If any fraction is negative, or if no dataset could be loaded at all.
    """
    if splits is None:
        splits = ["train", "test"]

    # Validate the proportions up front so a bad config fails before anything is loaded.
    fracs = list(dataset_mixer.values())
    if any(frac < 0 for frac in fracs):
        raise ValueError("Dataset fractions cannot be negative.")

    raw_datasets = DatasetDict()
    raw_train_datasets = []
    raw_val_datasets = []
    for ds in dataset_mixer:
        for idx, split in enumerate(splits):
            try:
                # Try first if dataset on a Hub repo
                dataset = load_dataset(ds, split=split)
            except DatasetGenerationError:
                # If not, check local dataset
                dataset = load_from_disk(os.path.join(ds, split))

            if idx == 0:
                raw_train_datasets.append(dataset)
            else:
                raw_val_datasets.append(dataset)

    if len(raw_train_datasets) > 0:
        # Keep only the leading `frac` share of each training set.
        train_subsets = [
            dataset.select(range(int(frac * len(dataset)))) for dataset, frac in zip(raw_train_datasets, fracs)
        ]
        if shuffle:
            raw_datasets["train"] = concatenate_datasets(train_subsets).shuffle(seed=42)
        else:
            raw_datasets["train"] = concatenate_datasets(train_subsets)
    # No subsampling for test datasets to enable fair comparison across models
    if len(raw_val_datasets) > 0:
        if shuffle:
            raw_datasets["test"] = concatenate_datasets(raw_val_datasets).shuffle(seed=42)
        else:
            raw_datasets["test"] = concatenate_datasets(raw_val_datasets)

    if len(raw_datasets) == 0:
        # Report `splits`, not the loop variable, which is unbound when nothing was iterated.
        raise ValueError(
            f"Dataset {dataset_mixer} not recognized with splits {splits}. Check the dataset has been correctly formatted."
        )

    return raw_datasets
|
190 |
+
|
alignment/model_utils.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
import os
|
16 |
+
from pathlib import Path
|
17 |
+
from typing import Dict
|
18 |
+
|
19 |
+
import torch
|
20 |
+
from transformers import AutoTokenizer, BitsAndBytesConfig, PreTrainedTokenizer
|
21 |
+
from transformers.trainer_utils import get_last_checkpoint
|
22 |
+
|
23 |
+
from accelerate import Accelerator
|
24 |
+
from huggingface_hub import list_repo_files
|
25 |
+
from huggingface_hub.utils._validators import HFValidationError
|
26 |
+
from peft import LoraConfig, PeftConfig
|
27 |
+
|
28 |
+
from .configs import DataArguments, DPOConfig, ModelArguments, SFTConfig
|
29 |
+
from .data import DEFAULT_CHAT_TEMPLATE
|
30 |
+
|
31 |
+
|
32 |
+
def get_current_device() -> int:
|
33 |
+
"""Get the current device. For GPU we return the local process index to enable multiple GPU training."""
|
34 |
+
return Accelerator().local_process_index if torch.cuda.is_available() else "cpu"
|
35 |
+
|
36 |
+
|
37 |
+
def get_kbit_device_map() -> Dict[str, int] | None:
|
38 |
+
"""Useful for running inference with quantized models by setting `device_map=get_peft_device_map()`"""
|
39 |
+
return {"": get_current_device()} if torch.cuda.is_available() else None
|
40 |
+
|
41 |
+
|
42 |
+
def get_quantization_config(model_args: ModelArguments) -> BitsAndBytesConfig | None:
    """
    Translate the quantization flags on `model_args` into a `BitsAndBytesConfig`.

    4-bit takes precedence over 8-bit. In 4-bit mode the compute dtype is `torch.float16` unless
    `model_args.torch_dtype` names another dtype (anything other than `"auto"` or `None`).

    Returns:
        The matching `BitsAndBytesConfig`, or `None` when neither flag is set.
    """
    if model_args.load_in_4bit:
        requested_dtype = model_args.torch_dtype
        if requested_dtype in {"auto", None}:
            compute_dtype = torch.float16
        else:
            compute_dtype = getattr(torch, requested_dtype)

        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
            bnb_4bit_use_double_quant=model_args.use_bnb_nested_quant,
        )

    if model_args.load_in_8bit:
        return BitsAndBytesConfig(load_in_8bit=True)

    return None
|
62 |
+
|
63 |
+
|
64 |
+
def get_tokenizer(model_args: ModelArguments, data_args: DataArguments) -> PreTrainedTokenizer:
    """
    Get the tokenizer for the model.

    Loads the tokenizer for `model_args.model_name_or_path` at `model_args.model_revision`, then:
    falls back to the EOS token for padding when no pad token is set, applies
    `data_args.truncation_side` when given, caps `model_max_length` at 2048 when the tokenizer reports
    more than 100,000, and selects the chat template (explicit `data_args.chat_template` first, then
    whatever the tokenizer ships with, then `DEFAULT_CHAT_TEMPLATE`).
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        revision=model_args.model_revision,
    )
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    if data_args.truncation_side is not None:
        tokenizer.truncation_side = data_args.truncation_side

    # Set reasonable default for models without max length
    if tokenizer.model_max_length > 100_000:
        tokenizer.model_max_length = 2048

    if data_args.chat_template is not None:
        tokenizer.chat_template = data_args.chat_template
    # Not every tokenizer object exposes `default_chat_template`, so look it up defensively.
    elif tokenizer.chat_template is None and getattr(tokenizer, "default_chat_template", None) is None:
        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

    return tokenizer
|
86 |
+
|
87 |
+
|
88 |
+
def get_peft_config(model_args: ModelArguments) -> PeftConfig | None:
    """
    Build the LoRA configuration described by `model_args`.

    Returns:
        `None` when `model_args.use_peft` is exactly `False`; otherwise a causal-LM `LoraConfig` built
        from the `lora_*` fields, with bias training disabled.
    """
    if model_args.use_peft is not False:
        return LoraConfig(
            r=model_args.lora_r,
            lora_alpha=model_args.lora_alpha,
            lora_dropout=model_args.lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=model_args.lora_target_modules,
            modules_to_save=model_args.lora_modules_to_save,
        )

    return None
|
103 |
+
|
104 |
+
|
105 |
+
def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool:
    """
    Tell whether `model_name_or_path` holds PEFT adapter weights.

    The Hub is queried first; when the name is not a valid repo id, or no such repo exists, the value is
    treated as a local directory instead.

    Returns:
        `True` if `adapter_model.safetensors` or `adapter_model.bin` is among the files found.
    """
    # Imported locally: huggingface_hub is already a dependency of this module.
    from huggingface_hub.utils import RepositoryNotFoundError

    try:
        # Try first if model on a Hub repo
        repo_files = list_repo_files(model_name_or_path, revision=revision)
    except (HFValidationError, RepositoryNotFoundError):
        # If not, check local repo. A relative path such as "data/my-model" is a syntactically valid
        # repo id, so it surfaces as "repository not found" rather than as a validation error.
        repo_files = os.listdir(model_name_or_path)
    return "adapter_model.safetensors" in repo_files or "adapter_model.bin" in repo_files
|
113 |
+
|
114 |
+
|
115 |
+
def get_checkpoint(training_args: SFTConfig | DPOConfig) -> str | None:
    """
    Look up the most recent checkpoint inside `training_args.output_dir`.

    Returns:
        Whatever `get_last_checkpoint` reports for the output directory, or `None` when the directory
        does not exist.
    """
    if not os.path.isdir(training_args.output_dir):
        return None
    return get_last_checkpoint(training_args.output_dir)
|
alignment/release.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
|
16 |
+
import argparse
|
17 |
+
import re
|
18 |
+
|
19 |
+
import packaging.version
|
20 |
+
|
21 |
+
|
22 |
+
# Per file kind: (regex locating the current version, replacement template).
# The literal "VERSION" inside each template is swapped for the real version string
# by update_version_in_file before substitution.
REPLACE_PATTERNS = dict(
    init=(
        re.compile(r'^__version__\s+=\s+"([^"]+)"\s*$', re.MULTILINE),
        '__version__ = "VERSION"\n',
    ),
    setup=(
        re.compile(r'^(\s*)version\s*=\s*"[^"]+",', re.MULTILINE),
        r'\1version="VERSION",',
    ),
)
# File that each pattern kind above is applied to.
REPLACE_FILES = dict(init="src/alignment/__init__.py", setup="setup.py")
README_FILE = "README.md"
|
31 |
+
|
32 |
+
|
33 |
+
def update_version_in_file(fname, version, pattern):
    """Rewrite `fname` in place, replacing the version matched by the `pattern` entry of REPLACE_PATTERNS."""
    with open(fname, "r", encoding="utf-8", newline="\n") as source:
        original_text = source.read()
    regex, template = REPLACE_PATTERNS[pattern]
    # The template carries a "VERSION" placeholder; fill it in before substituting.
    updated_text = regex.sub(template.replace("VERSION", version), original_text)
    with open(fname, "w", encoding="utf-8", newline="\n") as target:
        target.write(updated_text)
|
42 |
+
|
43 |
+
|
44 |
+
def global_version_update(version, patch=False):
    """Write `version` into every file registered in REPLACE_FILES.

    `patch` is accepted by the signature but is not consulted in this function.
    """
    for kind in REPLACE_FILES:
        update_version_in_file(REPLACE_FILES[kind], version, kind)
|
48 |
+
|
49 |
+
|
50 |
+
def get_version():
    """Read the current version from the package `__init__` file.

    Returns:
        The result of `packaging.version.parse` on the version string found in
        `REPLACE_FILES["init"]`.

    Raises:
        ValueError: If the file contains no `__version__ = "..."` line.
    """
    # Read as UTF-8, matching how update_version_in_file reads and writes this same file.
    with open(REPLACE_FILES["init"], "r", encoding="utf-8") as f:
        code = f.read()
    match = REPLACE_PATTERNS["init"][0].search(code)
    if match is None:
        # Without this check the failure surfaced as an AttributeError on None.
        raise ValueError(f"Could not find a __version__ assignment in {REPLACE_FILES['init']}.")
    return packaging.version.parse(match.group(1))
|
56 |
+
|
57 |
+
|
58 |
+
def pre_release_work(patch=False):
    """Pick the version to release, confirm it with the user, and write it to all files.

    The suggested version is the base version when the current one is a dev release,
    the next micro version for a patch, and the next minor version otherwise.
    Raises ValueError when a patch is requested from a dev version.
    """
    current = get_version()
    if current.is_devrelease:
        if patch:
            raise ValueError("Can't create a patch version from the dev branch, checkout a released version!")
        suggested = current.base_version
    elif patch:
        suggested = f"{current.major}.{current.minor}.{current.micro + 1}"
    else:
        suggested = f"{current.major}.{current.minor + 1}.0"

    # An empty answer accepts the suggestion shown in brackets.
    answer = input(f"Which version are you releasing? [{suggested}]")
    version = answer if answer else suggested

    print(f"Updating version to {version}.")
    global_version_update(version, patch=patch)
|
78 |
+
|
79 |
+
|
80 |
+
def post_release_work():
    """Move the files to the next development version after a release.

    The suggested version is the next minor version with a `.dev0` suffix; the user
    may type a different one at the prompt.
    """
    # First let's get the current version
    current_version = get_version()
    dev_version = f"{current_version.major}.{current_version.minor + 1}.0.dev0"

    # Check with the user we got that right. An empty answer accepts the suggestion.
    version = input(f"Which version are we developing now? [{dev_version}]")
    if not version:
        version = dev_version

    print(f"Updating version to {version}.")
    global_version_update(version)
|
94 |
+
|
95 |
+
|
96 |
+
if __name__ == "__main__":
    # Command-line entry point: pre-release by default, post-release with --post_release.
    cli = argparse.ArgumentParser()
    cli.add_argument("--post_release", action="store_true", help="Whether this is pre or post release.")
    cli.add_argument("--patch", action="store_true", help="Whether or not this is a patch release.")
    options = cli.parse_args()
    if options.post_release:
        if options.patch:
            print("Nothing to do after a patch :-)")
        else:
            post_release_work()
    else:
        pre_release_work(patch=options.patch)
|
config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "sanchit-gandhi/Mistral-7B-v0.1-6-layer",
|
3 |
+
"architectures": [
|
4 |
+
"MistralForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"bos_token_id": 1,
|
8 |
+
"eos_token_id": 2,
|
9 |
+
"hidden_act": "silu",
|
10 |
+
"hidden_size": 4096,
|
11 |
+
"initializer_range": 0.02,
|
12 |
+
"intermediate_size": 14336,
|
13 |
+
"max_position_embeddings": 32768,
|
14 |
+
"model_type": "mistral",
|
15 |
+
"num_attention_heads": 32,
|
16 |
+
"num_hidden_layers": 6,
|
17 |
+
"num_key_value_heads": 8,
|
18 |
+
"rms_norm_eps": 1e-05,
|
19 |
+
"rope_theta": 10000.0,
|
20 |
+
"sliding_window": 4096,
|
21 |
+
"tie_word_embeddings": false,
|
22 |
+
"torch_dtype": "bfloat16",
|
23 |
+
"transformers_version": "4.40.0.dev0",
|
24 |
+
"use_cache": false,
|
25 |
+
"vocab_size": 32000
|
26 |
+
}
|
config_full.yaml
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Model arguments
|
2 |
+
model_name_or_path: sanchit-gandhi/Mistral-7B-v0.1-6-layer
|
3 |
+
model_revision: main
|
4 |
+
torch_dtype: bfloat16
|
5 |
+
use_flash_attention_2: false # torch sdpa sufficient
|
6 |
+
|
7 |
+
# Data training arguments
|
8 |
+
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
|
9 |
+
dataset_mixer:
|
10 |
+
stingning/ultrachat: 1.0
|
11 |
+
dataset_splits:
|
12 |
+
- train[1000:]
|
13 |
+
- train[:1000]
|
14 |
+
preprocessing_num_workers: 32
|
15 |
+
|
16 |
+
# SFT trainer config
|
17 |
+
bf16: true
|
18 |
+
do_eval: true
|
19 |
+
evaluation_strategy: steps
|
20 |
+
eval_steps: 5000
|
21 |
+
save_strategy: "steps"
|
22 |
+
save_total_limit: 5000
|
23 |
+
gradient_accumulation_steps: 1
|
24 |
+
gradient_checkpointing: true
|
25 |
+
gradient_checkpointing_kwargs:
|
26 |
+
use_reentrant: False
|
27 |
+
hub_strategy: every_save
|
28 |
+
learning_rate: 0.0001
|
29 |
+
log_level: info
|
30 |
+
logging_steps: 25
|
31 |
+
logging_strategy: steps
|
32 |
+
max_seq_length: 2048
|
33 |
+
max_steps: 20000
|
34 |
+
output_dir: ./
|
35 |
+
overwrite_output_dir: true
|
36 |
+
per_device_eval_batch_size: 32
|
37 |
+
per_device_train_batch_size: 32
|
38 |
+
push_to_hub: true
|
39 |
+
remove_unused_columns: true
|
40 |
+
report_to:
|
41 |
+
- tensorboard
|
42 |
+
- wandb
|
43 |
+
seed: 42
|
44 |
+
warmup_steps: 500
|
45 |
+
ddp_timeout: 7200
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d7f84b30ad1e26b72493f2e487a84b8fb077327a611d56fcd0605d78146fa822
|
3 |
+
size 3141646744
|
run_sft.py
ADDED
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding=utf-8
|
3 |
+
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
"""
|
17 |
+
Supervised fine-tuning script for decoder language models.
|
18 |
+
"""
|
19 |
+
|
20 |
+
import logging
|
21 |
+
import random
|
22 |
+
import sys
|
23 |
+
|
24 |
+
import datasets
|
25 |
+
import torch
|
26 |
+
import transformers
|
27 |
+
from transformers import set_seed
|
28 |
+
|
29 |
+
from alignment import (
|
30 |
+
DataArguments,
|
31 |
+
H4ArgumentParser,
|
32 |
+
ModelArguments,
|
33 |
+
SFTConfig,
|
34 |
+
apply_chat_template,
|
35 |
+
get_checkpoint,
|
36 |
+
get_datasets,
|
37 |
+
get_kbit_device_map,
|
38 |
+
get_peft_config,
|
39 |
+
get_quantization_config,
|
40 |
+
get_tokenizer,
|
41 |
+
)
|
42 |
+
from trl import SFTTrainer
|
43 |
+
|
44 |
+
|
45 |
+
logger = logging.getLogger(__name__)
|
46 |
+
|
47 |
+
|
48 |
+
def main():
    """Run supervised fine-tuning end to end from the parsed H4 arguments.

    Steps, in order: parse arguments, seed, configure logging, look for an existing
    checkpoint in the output directory, load the datasets (building a `messages`
    column when it is missing), apply the chat template, assemble the model kwargs,
    construct the `SFTTrainer`, train, optionally evaluate, save the model and model
    card, and optionally push to the Hub.
    """
    parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig))
    model_args, data_args, training_args = parser.parse()

    # Set seed for reproducibility
    set_seed(training_args.seed)

    ###############
    # Setup logging
    ###############
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process a small summary
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Model parameters {model_args}")
    logger.info(f"Data parameters {data_args}")
    logger.info(f"Training/evaluation parameters {training_args}")

    # Check for last checkpoint
    last_checkpoint = get_checkpoint(training_args)
    if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
        logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.")

    ###############
    # Load datasets
    ###############
    raw_datasets = get_datasets(data_args, splits=data_args.dataset_splits)
    logger.info(
        f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
    )
    # Captured before any "messages" column is added, so only the original columns
    # are passed to `remove_columns` when the chat template is applied below.
    column_names = list(raw_datasets["train"].features)
    if "messages" not in column_names:
        # The desc previously duplicated the sample-logging step's text; it now names this step.
        with training_args.main_process_first(desc="Formatting messages"):

            def format_messages(example):
                # Turns are read from the flat list under "data"; even positions are
                # labelled as the user and odd positions as the assistant.
                messages = []
                for idx, message in enumerate(example["data"]):
                    role = "user" if idx % 2 == 0 else "assistant"
                    messages.append({"content": message, "role": role})
                example["messages"] = messages
                return example

            raw_datasets = raw_datasets.map(
                format_messages,
                desc="Formatting messages",
                num_proc=data_args.preprocessing_num_workers,
            )

    ################
    # Load tokenizer
    ################
    tokenizer = get_tokenizer(model_args, data_args)

    #####################
    # Apply chat template
    #####################
    with training_args.main_process_first():
        raw_datasets = raw_datasets.map(
            apply_chat_template,
            fn_kwargs={"tokenizer": tokenizer, "task": "sft"},
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            desc="Applying chat template",
        )
    train_dataset = raw_datasets["train"]
    eval_dataset = raw_datasets["test"]

    with training_args.main_process_first(desc="Log a few random samples from the processed training set"):
        # random.sample raises ValueError when asked for more items than the population
        # holds, so cap the sample count at the dataset size.
        num_samples = min(3, len(train_dataset))
        for index in random.sample(range(len(train_dataset)), num_samples):
            logger.info(f"Sample {index} of the processed training set:\n\n{train_dataset[index]['text']}")

    #######################
    # Load pretrained model
    #######################
    logger.info("*** Load pretrained model ***")
    torch_dtype = (
        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
    )
    quantization_config = get_quantization_config(model_args)

    model_kwargs = dict(
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
        use_flash_attention_2=model_args.use_flash_attention_2,
        torch_dtype=torch_dtype,
        # The KV cache is switched off whenever gradient checkpointing is enabled.
        use_cache=not training_args.gradient_checkpointing,
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
    )

    ########################
    # Initialize the Trainer
    ########################
    # Only the model id and its kwargs are handed over here, so the weights are
    # instantiated inside the SFTTrainer constructor rather than in this script.
    trainer = SFTTrainer(
        model=model_args.model_name_or_path,
        model_init_kwargs=model_kwargs,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="text",
        max_seq_length=training_args.max_seq_length,
        tokenizer=tokenizer,
        packing=True,
        peft_config=get_peft_config(model_args),
    )
    # Logged after the trainer exists; previously this fired before any model was built.
    logger.info("*** Model loaded! ***")

    ###############
    # Training loop
    ###############
    logger.info("*** Train ***")
    # An explicit resume_from_checkpoint wins over a checkpoint detected in output_dir.
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    metrics["train_samples"] = len(train_dataset)
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    ##########
    # Evaluate
    ##########
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        metrics["eval_samples"] = len(eval_dataset)
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    ##################################
    # Save model and create model card
    ##################################
    logger.info("*** Save model ***")
    trainer.save_model(training_args.output_dir)
    logger.info(f"Model saved to {training_args.output_dir}")

    # Save everything else on main process
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "dataset": list(data_args.dataset_mixer.keys()),
        "dataset_tags": list(data_args.dataset_mixer.keys()),
        "tags": ["alignment-handbook"],
    }
    if trainer.accelerator.is_main_process:
        trainer.create_model_card(**kwargs)
        # Restore k,v cache for fast inference
        trainer.model.config.use_cache = True
        trainer.model.config.save_pretrained(training_args.output_dir)

    if training_args.push_to_hub is True:
        logger.info("Pushing to hub...")
        trainer.push_to_hub(**kwargs)

    logger.info("*** Training complete ***")


if __name__ == "__main__":
    main()
|
218 |
+
|
runs/Apr24_14-23-38_ip-26-0-162-233/events.out.tfevents.1713973415.ip-26-0-162-233.1840687.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:713e8ed73c7d50dde946e1af7c24c1babc165667a442d5f8e8f3674cf32ae072
|
3 |
+
size 4886
|
runs/Apr24_16-42-31_ip-26-0-162-233/events.out.tfevents.1713977002.ip-26-0-162-233.1854033.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d99d51bb7fcd506f76273107b7f54b0a07695a2cf620317840bf9823aa458c38
|
3 |
+
size 9086
|
slurm_job.slurm
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
#SBATCH --job-name=distil-zephyr
|
3 |
+
#SBATCH --nodes=1
|
4 |
+
# set 24h for job wall time limit
|
5 |
+
#SBATCH --time=24:00:00
|
6 |
+
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
|
7 |
+
#SBATCH --cpus-per-task=32
|
8 |
+
#SBATCH --gres=gpu:8
|
9 |
+
#SBATCH --exclusive
|
10 |
+
#SBATCH --partition=hopper-prod
|
11 |
+
#SBATCH --output=/fsx/sanchit/alignment-logs/%x-%j.out
|
12 |
+
|
13 |
+
# Trace every command and stop at the first one that fails.
set -x -e

# START EDIT
source ~/.bashrc
source /fsx/sanchit/miniconda3/bin/activate alignment

LOG_PATH="/fsx/sanchit/alignment-logs/main_log.txt"
SAVE_DIR="/fsx/sanchit"
# END EDIT

echo "START TIME: $(date)"

GPUS_PER_NODE=8
NNODES=$SLURM_NNODES

# so processes know who to talk to
# The node list is quoted because compressed forms such as "node-[1-4]" contain
# glob characters that an unquoted expansion would subject to pathname expansion.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
|
30 |
+
|
31 |
+
# From https://i.hsfzxjy.site/2021-03-10-obtain-a-random-unused-tcp-port-with-bash/
|
32 |
+
# Print N (default 1) randomly chosen TCP ports from 1025-65535 that do not appear
# as a local port in the `ss -Htan` socket listing.
# The port is taken as everything after the LAST ":" of the local-address column, so
# IPv6 endpoints such as "[::]:22" yield "22"; taking the second ":"-separated field
# returned an empty string for them and left those ports in the candidate set.
function unused_port() {
    local N=${1:-1}
    comm -23 \
        <(seq "1025" "65535" | sort) \
        <(ss -Htan |
            awk '{print $4}' |
            sed 's/.*://' |
            sort -u) |
        shuf |
        head -n "$N"
}
|
43 |
+
MASTER_PORT=$(unused_port)

# export TORCH_CPP_LOG_LEVEL=INFO
# export TORCH_DISTRIBUTED_DEBUG=DETAIL

export LAUNCHER="python -u -m accelerate.commands.launch --config_file ./accelerate_config.yaml"

export PROGRAM="./run_sft.py ./config_full.yaml"
export CMD="$LAUNCHER $PROGRAM"
echo "$CMD"

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    "

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
# Output is appended to the file configured in LOG_PATH rather than a second,
# hard-coded location. $SRUN_ARGS stays unquoted on purpose so it splits into options.
srun $SRUN_ARGS --jobid "$SLURM_JOB_ID" bash -c "$CMD" 2>&1 | tee -a "$LOG_PATH"
# The pipeline's own status is tee's, so read srun's exit code explicitly.
SRUN_STATUS=${PIPESTATUS[0]}

echo "END TIME: $(date)"

# Propagate the training step's exit code so a failed run is not reported as success.
exit "$SRUN_STATUS"
|
76 |
+
|
special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "</s>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": "</s>",
|
17 |
+
"unk_token": {
|
18 |
+
"content": "<unk>",
|
19 |
+
"lstrip": false,
|
20 |
+
"normalized": false,
|
21 |
+
"rstrip": false,
|
22 |
+
"single_word": false
|
23 |
+
}
|
24 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
|
3 |
+
size 493443
|
tokenizer_config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"0": {
|
6 |
+
"content": "<unk>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"1": {
|
14 |
+
"content": "<s>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"2": {
|
22 |
+
"content": "</s>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": false,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"additional_special_tokens": [],
|
31 |
+
"bos_token": "<s>",
|
32 |
+
"chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
|
33 |
+
"clean_up_tokenization_spaces": false,
|
34 |
+
"eos_token": "</s>",
|
35 |
+
"legacy": true,
|
36 |
+
"model_max_length": 2048,
|
37 |
+
"pad_token": "</s>",
|
38 |
+
"sp_model_kwargs": {},
|
39 |
+
"spaces_between_special_tokens": false,
|
40 |
+
"tokenizer_class": "LlamaTokenizer",
|
41 |
+
"unk_token": "<unk>",
|
42 |
+
"use_default_system_prompt": false
|
43 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1464ab5521091ef44c1647b6866ecc70515e4a2469ed5b7ed407275c3c551c0d
|
3 |
+
size 4984
|
wandb/debug-cli.sanchit.log
ADDED
File without changes
|
wandb/debug-internal.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
wandb/debug.log
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-04-24 16:43:24,533 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
|
2 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Configure stats pid to 1854033
|
3 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
|
4 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/settings
|
5 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
|
6 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py'}
|
8 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug.log
|
9 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log
|
10 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():564] calling init triggers
|
11 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
|
12 |
+
config: {}
|
13 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():614] starting backend
|
14 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():618] setting up manager
|
15 |
+
2024-04-24 16:43:24,537 INFO MainThread:1854033 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-04-24 16:43:24,541 INFO MainThread:1854033 [wandb_init.py:init():624] backend started and connected
|
17 |
+
2024-04-24 16:43:24,544 INFO MainThread:1854033 [wandb_init.py:init():716] updated telemetry
|
18 |
+
2024-04-24 16:43:24,569 INFO MainThread:1854033 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-04-24 16:43:24,850 INFO MainThread:1854033 [wandb_run.py:_on_init():2254] communicating current version
|
20 |
+
2024-04-24 16:43:24,896 INFO MainThread:1854033 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-04-24 16:43:24,896 INFO MainThread:1854033 [wandb_init.py:init():800] starting run threads in backend
|
23 |
+
2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_console_start():2233] atexit reg
|
24 |
+
2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2088] redirect: wrap_raw
|
25 |
+
2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2153] Wrapping output streams.
|
26 |
+
2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2178] Redirects installed.
|
27 |
+
2024-04-24 16:43:30,533 INFO MainThread:1854033 [wandb_init.py:init():841] run started, returning control to user process
|
28 |
+
2024-04-24 16:43:30,535 INFO MainThread:1854033 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.40.0.dev0', 'model_type': 'mistral', 'output_dir': './', 'overwrite_output_dir': 
True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 20000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr24_16-42-31_ip-26-0-162-233', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 5000, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 5000, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
|
wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml
ADDED
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: venv
|
2 |
+
channels:
|
3 |
+
- pytorch
|
4 |
+
- nvidia
|
5 |
+
- defaults
|
6 |
+
dependencies:
|
7 |
+
- _libgcc_mutex=0.1=main
|
8 |
+
- _openmp_mutex=5.1=1_gnu
|
9 |
+
- blas=1.0=mkl
|
10 |
+
- brotli-python=1.0.9=py311h6a678d5_7
|
11 |
+
- bzip2=1.0.8=h7b6447c_0
|
12 |
+
- ca-certificates=2023.12.12=h06a4308_0
|
13 |
+
- certifi=2023.11.17=py311h06a4308_0
|
14 |
+
- cffi=1.16.0=py311h5eee18b_0
|
15 |
+
- cryptography=41.0.7=py311hdda0065_0
|
16 |
+
- cuda-cudart=12.1.105=0
|
17 |
+
- cuda-cupti=12.1.105=0
|
18 |
+
- cuda-libraries=12.1.0=0
|
19 |
+
- cuda-nvrtc=12.1.105=0
|
20 |
+
- cuda-nvtx=12.1.105=0
|
21 |
+
- cuda-opencl=12.3.101=0
|
22 |
+
- cuda-runtime=12.1.0=0
|
23 |
+
- ffmpeg=4.3=hf484d3e_0
|
24 |
+
- filelock=3.13.1=py311h06a4308_0
|
25 |
+
- freetype=2.12.1=h4a9f257_0
|
26 |
+
- giflib=5.2.1=h5eee18b_3
|
27 |
+
- gmp=6.2.1=h295c915_3
|
28 |
+
- gmpy2=2.1.2=py311hc9b5ff0_0
|
29 |
+
- gnutls=3.6.15=he1e5248_0
|
30 |
+
- intel-openmp=2023.1.0=hdb19cb5_46306
|
31 |
+
- jinja2=3.1.2=py311h06a4308_0
|
32 |
+
- jpeg=9e=h5eee18b_1
|
33 |
+
- lame=3.100=h7b6447c_0
|
34 |
+
- lcms2=2.12=h3be6417_0
|
35 |
+
- ld_impl_linux-64=2.38=h1181459_1
|
36 |
+
- lerc=3.0=h295c915_0
|
37 |
+
- libcublas=12.1.0.26=0
|
38 |
+
- libcufft=11.0.2.4=0
|
39 |
+
- libcufile=1.8.1.2=0
|
40 |
+
- libcurand=10.3.4.101=0
|
41 |
+
- libcusolver=11.4.4.55=0
|
42 |
+
- libcusparse=12.0.2.55=0
|
43 |
+
- libdeflate=1.17=h5eee18b_1
|
44 |
+
- libffi=3.4.4=h6a678d5_0
|
45 |
+
- libgcc-ng=11.2.0=h1234567_1
|
46 |
+
- libgomp=11.2.0=h1234567_1
|
47 |
+
- libiconv=1.16=h7f8727e_2
|
48 |
+
- libidn2=2.3.4=h5eee18b_0
|
49 |
+
- libjpeg-turbo=2.0.0=h9bf148f_0
|
50 |
+
- libnpp=12.0.2.50=0
|
51 |
+
- libnvjitlink=12.1.105=0
|
52 |
+
- libnvjpeg=12.1.1.14=0
|
53 |
+
- libpng=1.6.39=h5eee18b_0
|
54 |
+
- libstdcxx-ng=11.2.0=h1234567_1
|
55 |
+
- libtasn1=4.19.0=h5eee18b_0
|
56 |
+
- libtiff=4.5.1=h6a678d5_0
|
57 |
+
- libunistring=0.9.10=h27cfd23_0
|
58 |
+
- libuuid=1.41.5=h5eee18b_0
|
59 |
+
- libwebp=1.3.2=h11a3e52_0
|
60 |
+
- libwebp-base=1.3.2=h5eee18b_0
|
61 |
+
- llvm-openmp=14.0.6=h9e868ea_0
|
62 |
+
- lz4-c=1.9.4=h6a678d5_0
|
63 |
+
- markupsafe=2.1.1=py311h5eee18b_0
|
64 |
+
- mkl=2023.1.0=h213fc3f_46344
|
65 |
+
- mkl-service=2.4.0=py311h5eee18b_1
|
66 |
+
- mkl_fft=1.3.8=py311h5eee18b_0
|
67 |
+
- mkl_random=1.2.4=py311hdb19cb5_0
|
68 |
+
- mpc=1.1.0=h10f8cd9_1
|
69 |
+
- mpfr=4.0.2=hb69a4c5_1
|
70 |
+
- mpmath=1.3.0=py311h06a4308_0
|
71 |
+
- ncurses=6.4=h6a678d5_0
|
72 |
+
- nettle=3.7.3=hbbd107a_1
|
73 |
+
- networkx=3.1=py311h06a4308_0
|
74 |
+
- numpy=1.26.2=py311h08b1b3b_0
|
75 |
+
- numpy-base=1.26.2=py311hf175353_0
|
76 |
+
- openh264=2.1.1=h4ff587b_0
|
77 |
+
- openjpeg=2.4.0=h3ad879b_0
|
78 |
+
- openssl=3.0.12=h7f8727e_0
|
79 |
+
- pycparser=2.21=pyhd3eb1b0_0
|
80 |
+
- pyopenssl=23.2.0=py311h06a4308_0
|
81 |
+
- pysocks=1.7.1=py311h06a4308_0
|
82 |
+
- python=3.11.5=h955ad1f_0
|
83 |
+
- pytorch-cuda=12.1=ha16c6d3_5
|
84 |
+
- pytorch-mutex=1.0=cuda
|
85 |
+
- pyyaml=6.0.1=py311h5eee18b_0
|
86 |
+
- readline=8.2=h5eee18b_0
|
87 |
+
- requests=2.31.0=py311h06a4308_0
|
88 |
+
- setuptools=68.2.2=py311h06a4308_0
|
89 |
+
- sqlite=3.41.2=h5eee18b_0
|
90 |
+
- sympy=1.12=py311h06a4308_0
|
91 |
+
- tbb=2021.8.0=hdb19cb5_0
|
92 |
+
- tk=8.6.12=h1ccaba5_0
|
93 |
+
- wheel=0.41.2=py311h06a4308_0
|
94 |
+
- xz=5.4.5=h5eee18b_0
|
95 |
+
- yaml=0.2.5=h7b6447c_0
|
96 |
+
- zlib=1.2.13=h5eee18b_0
|
97 |
+
- zstd=1.5.5=hc292b87_0
|
98 |
+
- pip:
|
99 |
+
- absl-py==2.0.0
|
100 |
+
- accelerate==0.29.3
|
101 |
+
- aiohttp==3.9.1
|
102 |
+
- aiosignal==1.3.1
|
103 |
+
- annotated-types==0.6.0
|
104 |
+
- anyio==4.2.0
|
105 |
+
- appdirs==1.4.4
|
106 |
+
- argon2-cffi==23.1.0
|
107 |
+
- argon2-cffi-bindings==21.2.0
|
108 |
+
- arrow==1.3.0
|
109 |
+
- asttokens==2.4.1
|
110 |
+
- astunparse==1.6.3
|
111 |
+
- async-lru==2.0.4
|
112 |
+
- attrs==23.1.0
|
113 |
+
- audioread==3.0.1
|
114 |
+
- babel==2.14.0
|
115 |
+
- beautifulsoup4==4.12.3
|
116 |
+
- bitsandbytes==0.43.1
|
117 |
+
- bleach==6.1.0
|
118 |
+
- cachetools==5.3.2
|
119 |
+
- chardet==5.2.0
|
120 |
+
- charset-normalizer==3.3.2
|
121 |
+
- click==8.1.7
|
122 |
+
- comm==0.2.1
|
123 |
+
- datasets==2.18.1.dev0
|
124 |
+
- debugpy==1.8.1
|
125 |
+
- decorator==5.1.1
|
126 |
+
- deepspeed==0.12.2
|
127 |
+
- defusedxml==0.7.1
|
128 |
+
- dill==0.3.7
|
129 |
+
- docker-pycreds==0.4.0
|
130 |
+
- docstring-parser==0.15
|
131 |
+
- einops==0.7.0
|
132 |
+
- evaluate==0.4.0
|
133 |
+
- executing==2.0.1
|
134 |
+
- fastjsonschema==2.19.1
|
135 |
+
- flatbuffers==23.5.26
|
136 |
+
- fqdn==1.5.1
|
137 |
+
- frozenlist==1.4.1
|
138 |
+
- fsspec==2023.10.0
|
139 |
+
- gast==0.5.4
|
140 |
+
- gitdb==4.0.11
|
141 |
+
- gitpython==3.1.40
|
142 |
+
- google-auth==2.26.1
|
143 |
+
- google-auth-oauthlib==1.2.0
|
144 |
+
- google-pasta==0.2.0
|
145 |
+
- grpcio==1.60.0
|
146 |
+
- h11==0.14.0
|
147 |
+
- h5py==3.10.0
|
148 |
+
- hf-transfer==0.1.5
|
149 |
+
- hjson==3.1.0
|
150 |
+
- httpcore==1.0.2
|
151 |
+
- httpx==0.26.0
|
152 |
+
- huggingface-hub==0.22.2
|
153 |
+
- idna==3.6
|
154 |
+
- ipdb==0.13.13
|
155 |
+
- ipykernel==6.29.2
|
156 |
+
- ipython==8.21.0
|
157 |
+
- isoduration==20.11.0
|
158 |
+
- jedi==0.19.1
|
159 |
+
- jiwer==3.0.3
|
160 |
+
- joblib==1.3.2
|
161 |
+
- json5==0.9.14
|
162 |
+
- jsonpointer==2.4
|
163 |
+
- jsonschema==4.21.1
|
164 |
+
- jsonschema-specifications==2023.12.1
|
165 |
+
- jupyter-client==8.6.0
|
166 |
+
- jupyter-core==5.7.1
|
167 |
+
- jupyter-events==0.9.0
|
168 |
+
- jupyter-lsp==2.2.2
|
169 |
+
- jupyter-server==2.12.5
|
170 |
+
- jupyter-server-terminals==0.5.2
|
171 |
+
- jupyterlab==4.1.1
|
172 |
+
- jupyterlab-pygments==0.3.0
|
173 |
+
- jupyterlab-server==2.25.2
|
174 |
+
- keras==2.15.0
|
175 |
+
- lazy-loader==0.3
|
176 |
+
- libclang==16.0.6
|
177 |
+
- librosa==0.10.1
|
178 |
+
- llvmlite==0.41.1
|
179 |
+
- markdown==3.5.1
|
180 |
+
- markdown-it-py==3.0.0
|
181 |
+
- matplotlib-inline==0.1.6
|
182 |
+
- mdurl==0.1.2
|
183 |
+
- mistune==3.0.2
|
184 |
+
- ml-dtypes==0.2.0
|
185 |
+
- msgpack==1.0.7
|
186 |
+
- multidict==6.0.4
|
187 |
+
- multiprocess==0.70.15
|
188 |
+
- nbclient==0.9.0
|
189 |
+
- nbconvert==7.16.0
|
190 |
+
- nbformat==5.9.2
|
191 |
+
- nest-asyncio==1.6.0
|
192 |
+
- ninja==1.11.1.1
|
193 |
+
- nltk==3.8.1
|
194 |
+
- notebook-shim==0.2.3
|
195 |
+
- numba==0.58.1
|
196 |
+
- nvidia-cublas-cu12==12.1.3.1
|
197 |
+
- nvidia-cuda-cupti-cu12==12.1.105
|
198 |
+
- nvidia-cuda-nvrtc-cu12==12.1.105
|
199 |
+
- nvidia-cuda-runtime-cu12==12.1.105
|
200 |
+
- nvidia-cudnn-cu12==8.9.2.26
|
201 |
+
- nvidia-cufft-cu12==11.0.2.54
|
202 |
+
- nvidia-curand-cu12==10.3.2.106
|
203 |
+
- nvidia-cusolver-cu12==11.4.5.107
|
204 |
+
- nvidia-cusparse-cu12==12.1.0.106
|
205 |
+
- nvidia-nccl-cu12==2.20.5
|
206 |
+
- nvidia-nvjitlink-cu12==12.3.101
|
207 |
+
- nvidia-nvtx-cu12==12.1.105
|
208 |
+
- oauthlib==3.2.2
|
209 |
+
- opt-einsum==3.3.0
|
210 |
+
- overrides==7.7.0
|
211 |
+
- packaging==23.2
|
212 |
+
- pandas==2.1.4
|
213 |
+
- pandocfilters==1.5.1
|
214 |
+
- parso==0.8.3
|
215 |
+
- peft==0.7.1
|
216 |
+
- pexpect==4.9.0
|
217 |
+
- pillow==10.2.0
|
218 |
+
- pip==24.0
|
219 |
+
- platformdirs==4.1.0
|
220 |
+
- pooch==1.8.0
|
221 |
+
- prometheus-client==0.19.0
|
222 |
+
- prompt-toolkit==3.0.43
|
223 |
+
- protobuf==3.20.2
|
224 |
+
- psutil==5.9.7
|
225 |
+
- ptyprocess==0.7.0
|
226 |
+
- pure-eval==0.2.2
|
227 |
+
- py-cpuinfo==9.0.0
|
228 |
+
- pyarrow==14.0.2
|
229 |
+
- pyarrow-hotfix==0.6
|
230 |
+
- pyasn1==0.5.1
|
231 |
+
- pyasn1-modules==0.3.0
|
232 |
+
- pydantic==2.6.0
|
233 |
+
- pydantic-core==2.16.1
|
234 |
+
- pygments==2.17.2
|
235 |
+
- pynvml==11.5.0
|
236 |
+
- python-dateutil==2.8.2
|
237 |
+
- python-json-logger==2.0.7
|
238 |
+
- pytorch-triton==3.0.0+989adb9a29
|
239 |
+
- pytz==2023.3.post1
|
240 |
+
- pyzmq==25.1.2
|
241 |
+
- rapidfuzz==3.6.1
|
242 |
+
- referencing==0.33.0
|
243 |
+
- regex==2023.12.25
|
244 |
+
- requests-oauthlib==1.3.1
|
245 |
+
- responses==0.18.0
|
246 |
+
- rfc3339-validator==0.1.4
|
247 |
+
- rfc3986-validator==0.1.1
|
248 |
+
- rich==13.7.0
|
249 |
+
- rpds-py==0.17.1
|
250 |
+
- rsa==4.9
|
251 |
+
- safetensors==0.4.1
|
252 |
+
- scikit-learn==1.3.2
|
253 |
+
- scipy==1.11.4
|
254 |
+
- send2trash==1.8.2
|
255 |
+
- sentencepiece==0.1.99
|
256 |
+
- sentry-sdk==1.39.1
|
257 |
+
- setproctitle==1.3.3
|
258 |
+
- shtab==1.6.5
|
259 |
+
- six==1.16.0
|
260 |
+
- smmap==5.0.1
|
261 |
+
- sniffio==1.3.0
|
262 |
+
- soundfile==0.12.1
|
263 |
+
- soupsieve==2.5
|
264 |
+
- soxr==0.3.7
|
265 |
+
- stack-data==0.6.3
|
266 |
+
- tensorboard==2.15.1
|
267 |
+
- tensorboard-data-server==0.7.2
|
268 |
+
- tensorflow-cpu==2.15.0.post1
|
269 |
+
- tensorflow-estimator==2.15.0
|
270 |
+
- tensorflow-io-gcs-filesystem==0.35.0
|
271 |
+
- termcolor==2.4.0
|
272 |
+
- terminado==0.18.0
|
273 |
+
- threadpoolctl==3.2.0
|
274 |
+
- tinycss2==1.2.1
|
275 |
+
- tokenizers==0.15.0
|
276 |
+
- torch==2.4.0.dev20240323+cu121
|
277 |
+
- torchaudio==2.2.0.dev20240323+cu121
|
278 |
+
- torchvision==0.19.0.dev20240323+cu121
|
279 |
+
- tornado==6.4
|
280 |
+
- tqdm==4.66.1
|
281 |
+
- traitlets==5.14.1
|
282 |
+
- transformers==4.39.0.dev0
|
283 |
+
- triton==2.2.0
|
284 |
+
- trl==0.8.6
|
285 |
+
- types-python-dateutil==2.8.19.20240106
|
286 |
+
- typing-extensions==4.10.0
|
287 |
+
- tyro==0.7.0
|
288 |
+
- tzdata==2023.3
|
289 |
+
- uri-template==1.3.0
|
290 |
+
- urllib3==2.1.0
|
291 |
+
- wandb==0.16.1
|
292 |
+
- wcwidth==0.2.13
|
293 |
+
- webcolors==1.13
|
294 |
+
- webencodings==0.5.1
|
295 |
+
- websocket-client==1.7.0
|
296 |
+
- werkzeug==3.0.1
|
297 |
+
- wrapt==1.14.1
|
298 |
+
- xxhash==3.4.1
|
299 |
+
- yarl==1.9.4
|
300 |
+
prefix: /fsx/sanchit/miniconda3/envs/venv
|
wandb/run-20240424_154339-mwp0iutr/files/config.yaml
ADDED
@@ -0,0 +1,663 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
_wandb:
|
4 |
+
desc: null
|
5 |
+
value:
|
6 |
+
python_version: 3.11.5
|
7 |
+
cli_version: 0.16.1
|
8 |
+
framework: huggingface
|
9 |
+
huggingface_version: 4.40.0.dev0
|
10 |
+
is_jupyter_run: false
|
11 |
+
is_kaggle_kernel: false
|
12 |
+
start_time: 1713973419.470656
|
13 |
+
t:
|
14 |
+
1:
|
15 |
+
- 1
|
16 |
+
- 2
|
17 |
+
- 3
|
18 |
+
- 5
|
19 |
+
- 11
|
20 |
+
- 49
|
21 |
+
- 51
|
22 |
+
- 53
|
23 |
+
- 55
|
24 |
+
- 71
|
25 |
+
- 84
|
26 |
+
- 98
|
27 |
+
2:
|
28 |
+
- 1
|
29 |
+
- 2
|
30 |
+
- 3
|
31 |
+
- 5
|
32 |
+
- 11
|
33 |
+
- 49
|
34 |
+
- 51
|
35 |
+
- 53
|
36 |
+
- 55
|
37 |
+
- 71
|
38 |
+
- 84
|
39 |
+
- 98
|
40 |
+
3:
|
41 |
+
- 7
|
42 |
+
- 23
|
43 |
+
4: 3.11.5
|
44 |
+
5: 0.16.1
|
45 |
+
6: 4.40.0.dev0
|
46 |
+
8:
|
47 |
+
- 5
|
48 |
+
9:
|
49 |
+
1: transformers_trainer
|
50 |
+
13: linux-x86_64
|
51 |
+
m:
|
52 |
+
- 1: train/global_step
|
53 |
+
6:
|
54 |
+
- 3
|
55 |
+
- 1: train/loss
|
56 |
+
5: 1
|
57 |
+
6:
|
58 |
+
- 1
|
59 |
+
- 1: train/grad_norm
|
60 |
+
5: 1
|
61 |
+
6:
|
62 |
+
- 1
|
63 |
+
- 1: train/learning_rate
|
64 |
+
5: 1
|
65 |
+
6:
|
66 |
+
- 1
|
67 |
+
- 1: train/epoch
|
68 |
+
5: 1
|
69 |
+
6:
|
70 |
+
- 1
|
71 |
+
vocab_size:
|
72 |
+
desc: null
|
73 |
+
value: 32000
|
74 |
+
max_position_embeddings:
|
75 |
+
desc: null
|
76 |
+
value: 32768
|
77 |
+
hidden_size:
|
78 |
+
desc: null
|
79 |
+
value: 4096
|
80 |
+
intermediate_size:
|
81 |
+
desc: null
|
82 |
+
value: 14336
|
83 |
+
num_hidden_layers:
|
84 |
+
desc: null
|
85 |
+
value: 6
|
86 |
+
num_attention_heads:
|
87 |
+
desc: null
|
88 |
+
value: 32
|
89 |
+
sliding_window:
|
90 |
+
desc: null
|
91 |
+
value: 4096
|
92 |
+
num_key_value_heads:
|
93 |
+
desc: null
|
94 |
+
value: 8
|
95 |
+
hidden_act:
|
96 |
+
desc: null
|
97 |
+
value: silu
|
98 |
+
initializer_range:
|
99 |
+
desc: null
|
100 |
+
value: 0.02
|
101 |
+
rms_norm_eps:
|
102 |
+
desc: null
|
103 |
+
value: 1.0e-05
|
104 |
+
use_cache:
|
105 |
+
desc: null
|
106 |
+
value: false
|
107 |
+
rope_theta:
|
108 |
+
desc: null
|
109 |
+
value: 10000.0
|
110 |
+
attention_dropout:
|
111 |
+
desc: null
|
112 |
+
value: 0.0
|
113 |
+
return_dict:
|
114 |
+
desc: null
|
115 |
+
value: true
|
116 |
+
output_hidden_states:
|
117 |
+
desc: null
|
118 |
+
value: false
|
119 |
+
output_attentions:
|
120 |
+
desc: null
|
121 |
+
value: false
|
122 |
+
torchscript:
|
123 |
+
desc: null
|
124 |
+
value: false
|
125 |
+
torch_dtype:
|
126 |
+
desc: null
|
127 |
+
value: bfloat16
|
128 |
+
use_bfloat16:
|
129 |
+
desc: null
|
130 |
+
value: false
|
131 |
+
tf_legacy_loss:
|
132 |
+
desc: null
|
133 |
+
value: false
|
134 |
+
pruned_heads:
|
135 |
+
desc: null
|
136 |
+
value: {}
|
137 |
+
tie_word_embeddings:
|
138 |
+
desc: null
|
139 |
+
value: false
|
140 |
+
chunk_size_feed_forward:
|
141 |
+
desc: null
|
142 |
+
value: 0
|
143 |
+
is_encoder_decoder:
|
144 |
+
desc: null
|
145 |
+
value: false
|
146 |
+
is_decoder:
|
147 |
+
desc: null
|
148 |
+
value: false
|
149 |
+
cross_attention_hidden_size:
|
150 |
+
desc: null
|
151 |
+
value: null
|
152 |
+
add_cross_attention:
|
153 |
+
desc: null
|
154 |
+
value: false
|
155 |
+
tie_encoder_decoder:
|
156 |
+
desc: null
|
157 |
+
value: false
|
158 |
+
max_length:
|
159 |
+
desc: null
|
160 |
+
value: 20
|
161 |
+
min_length:
|
162 |
+
desc: null
|
163 |
+
value: 0
|
164 |
+
do_sample:
|
165 |
+
desc: null
|
166 |
+
value: false
|
167 |
+
early_stopping:
|
168 |
+
desc: null
|
169 |
+
value: false
|
170 |
+
num_beams:
|
171 |
+
desc: null
|
172 |
+
value: 1
|
173 |
+
num_beam_groups:
|
174 |
+
desc: null
|
175 |
+
value: 1
|
176 |
+
diversity_penalty:
|
177 |
+
desc: null
|
178 |
+
value: 0.0
|
179 |
+
temperature:
|
180 |
+
desc: null
|
181 |
+
value: 1.0
|
182 |
+
top_k:
|
183 |
+
desc: null
|
184 |
+
value: 50
|
185 |
+
top_p:
|
186 |
+
desc: null
|
187 |
+
value: 1.0
|
188 |
+
typical_p:
|
189 |
+
desc: null
|
190 |
+
value: 1.0
|
191 |
+
repetition_penalty:
|
192 |
+
desc: null
|
193 |
+
value: 1.0
|
194 |
+
length_penalty:
|
195 |
+
desc: null
|
196 |
+
value: 1.0
|
197 |
+
no_repeat_ngram_size:
|
198 |
+
desc: null
|
199 |
+
value: 0
|
200 |
+
encoder_no_repeat_ngram_size:
|
201 |
+
desc: null
|
202 |
+
value: 0
|
203 |
+
bad_words_ids:
|
204 |
+
desc: null
|
205 |
+
value: null
|
206 |
+
num_return_sequences:
|
207 |
+
desc: null
|
208 |
+
value: 1
|
209 |
+
output_scores:
|
210 |
+
desc: null
|
211 |
+
value: false
|
212 |
+
return_dict_in_generate:
|
213 |
+
desc: null
|
214 |
+
value: false
|
215 |
+
forced_bos_token_id:
|
216 |
+
desc: null
|
217 |
+
value: null
|
218 |
+
forced_eos_token_id:
|
219 |
+
desc: null
|
220 |
+
value: null
|
221 |
+
remove_invalid_values:
|
222 |
+
desc: null
|
223 |
+
value: false
|
224 |
+
exponential_decay_length_penalty:
|
225 |
+
desc: null
|
226 |
+
value: null
|
227 |
+
suppress_tokens:
|
228 |
+
desc: null
|
229 |
+
value: null
|
230 |
+
begin_suppress_tokens:
|
231 |
+
desc: null
|
232 |
+
value: null
|
233 |
+
architectures:
|
234 |
+
desc: null
|
235 |
+
value:
|
236 |
+
- MistralForCausalLM
|
237 |
+
finetuning_task:
|
238 |
+
desc: null
|
239 |
+
value: null
|
240 |
+
id2label:
|
241 |
+
desc: null
|
242 |
+
value:
|
243 |
+
'0': LABEL_0
|
244 |
+
'1': LABEL_1
|
245 |
+
label2id:
|
246 |
+
desc: null
|
247 |
+
value:
|
248 |
+
LABEL_0: 0
|
249 |
+
LABEL_1: 1
|
250 |
+
tokenizer_class:
|
251 |
+
desc: null
|
252 |
+
value: null
|
253 |
+
prefix:
|
254 |
+
desc: null
|
255 |
+
value: null
|
256 |
+
bos_token_id:
|
257 |
+
desc: null
|
258 |
+
value: 1
|
259 |
+
pad_token_id:
|
260 |
+
desc: null
|
261 |
+
value: null
|
262 |
+
eos_token_id:
|
263 |
+
desc: null
|
264 |
+
value: 2
|
265 |
+
sep_token_id:
|
266 |
+
desc: null
|
267 |
+
value: null
|
268 |
+
decoder_start_token_id:
|
269 |
+
desc: null
|
270 |
+
value: null
|
271 |
+
task_specific_params:
|
272 |
+
desc: null
|
273 |
+
value: null
|
274 |
+
problem_type:
|
275 |
+
desc: null
|
276 |
+
value: null
|
277 |
+
_name_or_path:
|
278 |
+
desc: null
|
279 |
+
value: sanchit-gandhi/Mistral-7B-v0.1-6-layer
|
280 |
+
transformers_version:
|
281 |
+
desc: null
|
282 |
+
value: 4.40.0.dev0
|
283 |
+
model_type:
|
284 |
+
desc: null
|
285 |
+
value: mistral
|
286 |
+
output_dir:
|
287 |
+
desc: null
|
288 |
+
value: ./
|
289 |
+
overwrite_output_dir:
|
290 |
+
desc: null
|
291 |
+
value: true
|
292 |
+
do_train:
|
293 |
+
desc: null
|
294 |
+
value: false
|
295 |
+
do_eval:
|
296 |
+
desc: null
|
297 |
+
value: true
|
298 |
+
do_predict:
|
299 |
+
desc: null
|
300 |
+
value: false
|
301 |
+
evaluation_strategy:
|
302 |
+
desc: null
|
303 |
+
value: steps
|
304 |
+
prediction_loss_only:
|
305 |
+
desc: null
|
306 |
+
value: false
|
307 |
+
per_device_train_batch_size:
|
308 |
+
desc: null
|
309 |
+
value: 64
|
310 |
+
per_device_eval_batch_size:
|
311 |
+
desc: null
|
312 |
+
value: 32
|
313 |
+
per_gpu_train_batch_size:
|
314 |
+
desc: null
|
315 |
+
value: null
|
316 |
+
per_gpu_eval_batch_size:
|
317 |
+
desc: null
|
318 |
+
value: null
|
319 |
+
gradient_accumulation_steps:
|
320 |
+
desc: null
|
321 |
+
value: 1
|
322 |
+
eval_accumulation_steps:
|
323 |
+
desc: null
|
324 |
+
value: null
|
325 |
+
eval_delay:
|
326 |
+
desc: null
|
327 |
+
value: 0
|
328 |
+
learning_rate:
|
329 |
+
desc: null
|
330 |
+
value: 0.0001
|
331 |
+
weight_decay:
|
332 |
+
desc: null
|
333 |
+
value: 0.0
|
334 |
+
adam_beta1:
|
335 |
+
desc: null
|
336 |
+
value: 0.9
|
337 |
+
adam_beta2:
|
338 |
+
desc: null
|
339 |
+
value: 0.999
|
340 |
+
adam_epsilon:
|
341 |
+
desc: null
|
342 |
+
value: 1.0e-08
|
343 |
+
max_grad_norm:
|
344 |
+
desc: null
|
345 |
+
value: 1.0
|
346 |
+
num_train_epochs:
|
347 |
+
desc: null
|
348 |
+
value: 3.0
|
349 |
+
max_steps:
|
350 |
+
desc: null
|
351 |
+
value: 20000
|
352 |
+
lr_scheduler_type:
|
353 |
+
desc: null
|
354 |
+
value: linear
|
355 |
+
lr_scheduler_kwargs:
|
356 |
+
desc: null
|
357 |
+
value: {}
|
358 |
+
warmup_ratio:
|
359 |
+
desc: null
|
360 |
+
value: 0.0
|
361 |
+
warmup_steps:
|
362 |
+
desc: null
|
363 |
+
value: 500
|
364 |
+
log_level:
|
365 |
+
desc: null
|
366 |
+
value: info
|
367 |
+
log_level_replica:
|
368 |
+
desc: null
|
369 |
+
value: warning
|
370 |
+
log_on_each_node:
|
371 |
+
desc: null
|
372 |
+
value: true
|
373 |
+
logging_dir:
|
374 |
+
desc: null
|
375 |
+
value: ./runs/Apr24_14-23-38_ip-26-0-162-233
|
376 |
+
logging_strategy:
|
377 |
+
desc: null
|
378 |
+
value: steps
|
379 |
+
logging_first_step:
|
380 |
+
desc: null
|
381 |
+
value: true
|
382 |
+
logging_steps:
|
383 |
+
desc: null
|
384 |
+
value: 25
|
385 |
+
logging_nan_inf_filter:
|
386 |
+
desc: null
|
387 |
+
value: true
|
388 |
+
save_strategy:
|
389 |
+
desc: null
|
390 |
+
value: steps
|
391 |
+
save_steps:
|
392 |
+
desc: null
|
393 |
+
value: 500
|
394 |
+
save_total_limit:
|
395 |
+
desc: null
|
396 |
+
value: 5000
|
397 |
+
save_safetensors:
|
398 |
+
desc: null
|
399 |
+
value: true
|
400 |
+
save_on_each_node:
|
401 |
+
desc: null
|
402 |
+
value: false
|
403 |
+
save_only_model:
|
404 |
+
desc: null
|
405 |
+
value: false
|
406 |
+
no_cuda:
|
407 |
+
desc: null
|
408 |
+
value: false
|
409 |
+
use_cpu:
|
410 |
+
desc: null
|
411 |
+
value: false
|
412 |
+
use_mps_device:
|
413 |
+
desc: null
|
414 |
+
value: false
|
415 |
+
seed:
|
416 |
+
desc: null
|
417 |
+
value: 42
|
418 |
+
data_seed:
|
419 |
+
desc: null
|
420 |
+
value: null
|
421 |
+
jit_mode_eval:
|
422 |
+
desc: null
|
423 |
+
value: false
|
424 |
+
use_ipex:
|
425 |
+
desc: null
|
426 |
+
value: false
|
427 |
+
bf16:
|
428 |
+
desc: null
|
429 |
+
value: true
|
430 |
+
fp16:
|
431 |
+
desc: null
|
432 |
+
value: false
|
433 |
+
fp16_opt_level:
|
434 |
+
desc: null
|
435 |
+
value: O1
|
436 |
+
half_precision_backend:
|
437 |
+
desc: null
|
438 |
+
value: auto
|
439 |
+
bf16_full_eval:
|
440 |
+
desc: null
|
441 |
+
value: false
|
442 |
+
fp16_full_eval:
|
443 |
+
desc: null
|
444 |
+
value: false
|
445 |
+
tf32:
|
446 |
+
desc: null
|
447 |
+
value: null
|
448 |
+
local_rank:
|
449 |
+
desc: null
|
450 |
+
value: 0
|
451 |
+
ddp_backend:
|
452 |
+
desc: null
|
453 |
+
value: null
|
454 |
+
tpu_num_cores:
|
455 |
+
desc: null
|
456 |
+
value: null
|
457 |
+
tpu_metrics_debug:
|
458 |
+
desc: null
|
459 |
+
value: false
|
460 |
+
debug:
|
461 |
+
desc: null
|
462 |
+
value: []
|
463 |
+
dataloader_drop_last:
|
464 |
+
desc: null
|
465 |
+
value: false
|
466 |
+
eval_steps:
|
467 |
+
desc: null
|
468 |
+
value: 5000
|
469 |
+
dataloader_num_workers:
|
470 |
+
desc: null
|
471 |
+
value: 0
|
472 |
+
dataloader_prefetch_factor:
|
473 |
+
desc: null
|
474 |
+
value: null
|
475 |
+
past_index:
|
476 |
+
desc: null
|
477 |
+
value: -1
|
478 |
+
run_name:
|
479 |
+
desc: null
|
480 |
+
value: ./
|
481 |
+
disable_tqdm:
|
482 |
+
desc: null
|
483 |
+
value: false
|
484 |
+
remove_unused_columns:
|
485 |
+
desc: null
|
486 |
+
value: true
|
487 |
+
label_names:
|
488 |
+
desc: null
|
489 |
+
value: null
|
490 |
+
load_best_model_at_end:
|
491 |
+
desc: null
|
492 |
+
value: false
|
493 |
+
metric_for_best_model:
|
494 |
+
desc: null
|
495 |
+
value: null
|
496 |
+
greater_is_better:
|
497 |
+
desc: null
|
498 |
+
value: null
|
499 |
+
ignore_data_skip:
|
500 |
+
desc: null
|
501 |
+
value: false
|
502 |
+
fsdp:
|
503 |
+
desc: null
|
504 |
+
value: []
|
505 |
+
fsdp_min_num_params:
|
506 |
+
desc: null
|
507 |
+
value: 0
|
508 |
+
fsdp_config:
|
509 |
+
desc: null
|
510 |
+
value:
|
511 |
+
min_num_params: 0
|
512 |
+
xla: false
|
513 |
+
xla_fsdp_v2: false
|
514 |
+
xla_fsdp_grad_ckpt: false
|
515 |
+
fsdp_transformer_layer_cls_to_wrap:
|
516 |
+
desc: null
|
517 |
+
value: null
|
518 |
+
accelerator_config:
|
519 |
+
desc: null
|
520 |
+
value:
|
521 |
+
split_batches: false
|
522 |
+
dispatch_batches: null
|
523 |
+
even_batches: true
|
524 |
+
use_seedable_sampler: true
|
525 |
+
gradient_accumulation_kwargs: null
|
526 |
+
deepspeed:
|
527 |
+
desc: null
|
528 |
+
value: null
|
529 |
+
label_smoothing_factor:
|
530 |
+
desc: null
|
531 |
+
value: 0.0
|
532 |
+
optim:
|
533 |
+
desc: null
|
534 |
+
value: adamw_torch
|
535 |
+
optim_args:
|
536 |
+
desc: null
|
537 |
+
value: null
|
538 |
+
adafactor:
|
539 |
+
desc: null
|
540 |
+
value: false
|
541 |
+
group_by_length:
|
542 |
+
desc: null
|
543 |
+
value: false
|
544 |
+
length_column_name:
|
545 |
+
desc: null
|
546 |
+
value: length
|
547 |
+
report_to:
|
548 |
+
desc: null
|
549 |
+
value:
|
550 |
+
- tensorboard
|
551 |
+
- wandb
|
552 |
+
ddp_find_unused_parameters:
|
553 |
+
desc: null
|
554 |
+
value: null
|
555 |
+
ddp_bucket_cap_mb:
|
556 |
+
desc: null
|
557 |
+
value: null
|
558 |
+
ddp_broadcast_buffers:
|
559 |
+
desc: null
|
560 |
+
value: null
|
561 |
+
dataloader_pin_memory:
|
562 |
+
desc: null
|
563 |
+
value: true
|
564 |
+
dataloader_persistent_workers:
|
565 |
+
desc: null
|
566 |
+
value: false
|
567 |
+
skip_memory_metrics:
|
568 |
+
desc: null
|
569 |
+
value: true
|
570 |
+
use_legacy_prediction_loop:
|
571 |
+
desc: null
|
572 |
+
value: false
|
573 |
+
push_to_hub:
|
574 |
+
desc: null
|
575 |
+
value: true
|
576 |
+
resume_from_checkpoint:
|
577 |
+
desc: null
|
578 |
+
value: null
|
579 |
+
hub_model_id:
|
580 |
+
desc: null
|
581 |
+
value: null
|
582 |
+
hub_strategy:
|
583 |
+
desc: null
|
584 |
+
value: every_save
|
585 |
+
hub_token:
|
586 |
+
desc: null
|
587 |
+
value: <HUB_TOKEN>
|
588 |
+
hub_private_repo:
|
589 |
+
desc: null
|
590 |
+
value: false
|
591 |
+
hub_always_push:
|
592 |
+
desc: null
|
593 |
+
value: false
|
594 |
+
gradient_checkpointing:
|
595 |
+
desc: null
|
596 |
+
value: true
|
597 |
+
gradient_checkpointing_kwargs:
|
598 |
+
desc: null
|
599 |
+
value:
|
600 |
+
use_reentrant: false
|
601 |
+
include_inputs_for_metrics:
|
602 |
+
desc: null
|
603 |
+
value: false
|
604 |
+
fp16_backend:
|
605 |
+
desc: null
|
606 |
+
value: auto
|
607 |
+
push_to_hub_model_id:
|
608 |
+
desc: null
|
609 |
+
value: null
|
610 |
+
push_to_hub_organization:
|
611 |
+
desc: null
|
612 |
+
value: null
|
613 |
+
push_to_hub_token:
|
614 |
+
desc: null
|
615 |
+
value: <PUSH_TO_HUB_TOKEN>
|
616 |
+
mp_parameters:
|
617 |
+
desc: null
|
618 |
+
value: ''
|
619 |
+
auto_find_batch_size:
|
620 |
+
desc: null
|
621 |
+
value: false
|
622 |
+
full_determinism:
|
623 |
+
desc: null
|
624 |
+
value: false
|
625 |
+
torchdynamo:
|
626 |
+
desc: null
|
627 |
+
value: null
|
628 |
+
ray_scope:
|
629 |
+
desc: null
|
630 |
+
value: last
|
631 |
+
ddp_timeout:
|
632 |
+
desc: null
|
633 |
+
value: 7200
|
634 |
+
torch_compile:
|
635 |
+
desc: null
|
636 |
+
value: false
|
637 |
+
torch_compile_backend:
|
638 |
+
desc: null
|
639 |
+
value: null
|
640 |
+
torch_compile_mode:
|
641 |
+
desc: null
|
642 |
+
value: null
|
643 |
+
dispatch_batches:
|
644 |
+
desc: null
|
645 |
+
value: null
|
646 |
+
split_batches:
|
647 |
+
desc: null
|
648 |
+
value: null
|
649 |
+
include_tokens_per_second:
|
650 |
+
desc: null
|
651 |
+
value: false
|
652 |
+
include_num_input_tokens_seen:
|
653 |
+
desc: null
|
654 |
+
value: false
|
655 |
+
neftune_noise_alpha:
|
656 |
+
desc: null
|
657 |
+
value: null
|
658 |
+
optim_target_modules:
|
659 |
+
desc: null
|
660 |
+
value: null
|
661 |
+
max_seq_length:
|
662 |
+
desc: null
|
663 |
+
value: 2048
|
wandb/run-20240424_154339-mwp0iutr/files/output.log
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
0%| | 0/20000 [00:00<?, ?it/s]/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.)
|
2 |
+
warnings.warn(
|
3 |
+
0%| | 1/20000 [00:06<38:19:46, 6.90s/it]
|
4 |
+
0%| | 1/20000 [00:06<38:19:46, 6.90s/it]Traceback (most recent call last):
|
5 |
+
File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 217, in <module>
|
6 |
+
main()
|
7 |
+
File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 172, in main
|
8 |
+
train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
9 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
10 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 361, in train
|
11 |
+
output = super().train(*args, **kwargs)
|
12 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
13 |
+
File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 1849, in train
|
14 |
+
return inner_training_loop(
|
15 |
+
^^^^^^^^^^^^^^^^^^^^
|
16 |
+
File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 2202, in _inner_training_loop
|
17 |
+
tr_loss_step = self.training_step(model, inputs)
|
18 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
19 |
+
File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3137, in training_step
|
20 |
+
loss = self.compute_loss(model, inputs)
|
21 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
22 |
+
File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3160, in compute_loss
|
23 |
+
outputs = model(**inputs)
|
24 |
+
^^^^^^^^^^^^^^^
|
25 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
|
26 |
+
return self._call_impl(*args, **kwargs)
|
27 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
28 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
|
29 |
+
return forward_call(*args, **kwargs)
|
30 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
31 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1608, in forward
|
32 |
+
else self._run_ddp_forward(*inputs, **kwargs)
|
33 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
34 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1426, in _run_ddp_forward
|
35 |
+
return self.module(*inputs, **kwargs) # type: ignore[index]
|
36 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
37 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
|
38 |
+
return self._call_impl(*args, **kwargs)
|
39 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
40 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
|
41 |
+
return forward_call(*args, **kwargs)
|
42 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
43 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 825, in forward
|
44 |
+
return model_forward(*args, **kwargs)
|
45 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
46 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 813, in __call__
|
47 |
+
return convert_to_fp32(self.model_forward(*args, **kwargs))
|
48 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
49 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
|
50 |
+
return func(*args, **kwargs)
|
51 |
+
^^^^^^^^^^^^^^^^^^^^^
|
52 |
+
File "/fsx/sanchit/transformers/src/transformers/models/mistral/modeling_mistral.py", line 1184, in forward
|
53 |
+
loss = loss_fct(shift_logits, shift_labels)
|
54 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
55 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
|
56 |
+
return self._call_impl(*args, **kwargs)
|
57 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
58 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
|
59 |
+
return forward_call(*args, **kwargs)
|
60 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
61 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1185, in forward
|
62 |
+
return F.cross_entropy(input, target, weight=self.weight,
|
63 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
64 |
+
File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/functional.py", line 3088, in cross_entropy
|
65 |
+
return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
|
66 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
67 |
+
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 15.62 GiB. GPU
|
68 |
+
[rank0]: Traceback (most recent call last):
|
69 |
+
[rank0]: File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 217, in <module>
|
70 |
+
[rank0]: main()
|
71 |
+
[rank0]: File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 172, in main
|
72 |
+
[rank0]: train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
73 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
74 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 361, in train
|
75 |
+
[rank0]: output = super().train(*args, **kwargs)
|
76 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
77 |
+
[rank0]: File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 1849, in train
|
78 |
+
[rank0]: return inner_training_loop(
|
79 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^
|
80 |
+
[rank0]: File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 2202, in _inner_training_loop
|
81 |
+
[rank0]: tr_loss_step = self.training_step(model, inputs)
|
82 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
83 |
+
[rank0]: File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3137, in training_step
|
84 |
+
[rank0]: loss = self.compute_loss(model, inputs)
|
85 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
86 |
+
[rank0]: File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3160, in compute_loss
|
87 |
+
[rank0]: outputs = model(**inputs)
|
88 |
+
[rank0]: ^^^^^^^^^^^^^^^
|
89 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
|
90 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
91 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
92 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
|
93 |
+
[rank0]: return forward_call(*args, **kwargs)
|
94 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
95 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1608, in forward
|
96 |
+
[rank0]: else self._run_ddp_forward(*inputs, **kwargs)
|
97 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
98 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1426, in _run_ddp_forward
|
99 |
+
[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index]
|
100 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
101 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
|
102 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
103 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
104 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
|
105 |
+
[rank0]: return forward_call(*args, **kwargs)
|
106 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
107 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 825, in forward
|
108 |
+
[rank0]: return model_forward(*args, **kwargs)
|
109 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
110 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 813, in __call__
|
111 |
+
[rank0]: return convert_to_fp32(self.model_forward(*args, **kwargs))
|
112 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
113 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
|
114 |
+
[rank0]: return func(*args, **kwargs)
|
115 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
|
116 |
+
[rank0]: File "/fsx/sanchit/transformers/src/transformers/models/mistral/modeling_mistral.py", line 1184, in forward
|
117 |
+
[rank0]: loss = loss_fct(shift_logits, shift_labels)
|
118 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
119 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
|
120 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
121 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
122 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
|
123 |
+
[rank0]: return forward_call(*args, **kwargs)
|
124 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
125 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1185, in forward
|
126 |
+
[rank0]: return F.cross_entropy(input, target, weight=self.weight,
|
127 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
128 |
+
[rank0]: File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/functional.py", line 3088, in cross_entropy
|
129 |
+
[rank0]: return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
|
130 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
131 |
+
[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 15.62 GiB. GPU
|
wandb/run-20240424_154339-mwp0iutr/files/requirements.txt
ADDED
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.0.0
|
2 |
+
accelerate==0.29.3
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
anyio==4.2.0
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
arrow==1.3.0
|
11 |
+
asttokens==2.4.1
|
12 |
+
astunparse==1.6.3
|
13 |
+
async-lru==2.0.4
|
14 |
+
attrs==23.1.0
|
15 |
+
audioread==3.0.1
|
16 |
+
babel==2.14.0
|
17 |
+
beautifulsoup4==4.12.3
|
18 |
+
bitsandbytes==0.43.1
|
19 |
+
bleach==6.1.0
|
20 |
+
brotli==1.0.9
|
21 |
+
cachetools==5.3.2
|
22 |
+
certifi==2023.11.17
|
23 |
+
cffi==1.16.0
|
24 |
+
chardet==5.2.0
|
25 |
+
charset-normalizer==2.0.4
|
26 |
+
click==8.1.7
|
27 |
+
comm==0.2.1
|
28 |
+
cryptography==41.0.7
|
29 |
+
datasets==2.18.1.dev0
|
30 |
+
debugpy==1.8.1
|
31 |
+
decorator==5.1.1
|
32 |
+
deepspeed==0.12.2
|
33 |
+
defusedxml==0.7.1
|
34 |
+
dill==0.3.7
|
35 |
+
docker-pycreds==0.4.0
|
36 |
+
docstring-parser==0.15
|
37 |
+
einops==0.7.0
|
38 |
+
evaluate==0.4.0
|
39 |
+
executing==2.0.1
|
40 |
+
fastjsonschema==2.19.1
|
41 |
+
filelock==3.13.1
|
42 |
+
flatbuffers==23.5.26
|
43 |
+
fqdn==1.5.1
|
44 |
+
frozenlist==1.4.1
|
45 |
+
fsspec==2023.10.0
|
46 |
+
gast==0.5.4
|
47 |
+
gitdb==4.0.11
|
48 |
+
gitpython==3.1.40
|
49 |
+
gmpy2==2.1.2
|
50 |
+
google-auth-oauthlib==1.2.0
|
51 |
+
google-auth==2.26.1
|
52 |
+
google-pasta==0.2.0
|
53 |
+
grpcio==1.60.0
|
54 |
+
h11==0.14.0
|
55 |
+
h5py==3.10.0
|
56 |
+
hf-transfer==0.1.5
|
57 |
+
hjson==3.1.0
|
58 |
+
httpcore==1.0.2
|
59 |
+
httpx==0.26.0
|
60 |
+
huggingface-hub==0.22.2
|
61 |
+
idna==3.4
|
62 |
+
ipdb==0.13.13
|
63 |
+
ipykernel==6.29.2
|
64 |
+
ipython==8.21.0
|
65 |
+
isoduration==20.11.0
|
66 |
+
jedi==0.19.1
|
67 |
+
jinja2==3.1.2
|
68 |
+
jiwer==3.0.3
|
69 |
+
joblib==1.3.2
|
70 |
+
json5==0.9.14
|
71 |
+
jsonpointer==2.4
|
72 |
+
jsonschema-specifications==2023.12.1
|
73 |
+
jsonschema==4.21.1
|
74 |
+
jupyter-client==8.6.0
|
75 |
+
jupyter-core==5.7.1
|
76 |
+
jupyter-events==0.9.0
|
77 |
+
jupyter-lsp==2.2.2
|
78 |
+
jupyter-server-terminals==0.5.2
|
79 |
+
jupyter-server==2.12.5
|
80 |
+
jupyterlab-pygments==0.3.0
|
81 |
+
jupyterlab-server==2.25.2
|
82 |
+
jupyterlab==4.1.1
|
83 |
+
keras==2.15.0
|
84 |
+
lazy-loader==0.3
|
85 |
+
libclang==16.0.6
|
86 |
+
librosa==0.10.1
|
87 |
+
llvmlite==0.41.1
|
88 |
+
markdown-it-py==3.0.0
|
89 |
+
markdown==3.5.1
|
90 |
+
markupsafe==2.1.1
|
91 |
+
matplotlib-inline==0.1.6
|
92 |
+
mdurl==0.1.2
|
93 |
+
mistune==3.0.2
|
94 |
+
mkl-fft==1.3.8
|
95 |
+
mkl-random==1.2.4
|
96 |
+
mkl-service==2.4.0
|
97 |
+
ml-dtypes==0.2.0
|
98 |
+
mpmath==1.3.0
|
99 |
+
msgpack==1.0.7
|
100 |
+
multidict==6.0.4
|
101 |
+
multiprocess==0.70.15
|
102 |
+
nbclient==0.9.0
|
103 |
+
nbconvert==7.16.0
|
104 |
+
nbformat==5.9.2
|
105 |
+
nest-asyncio==1.6.0
|
106 |
+
networkx==3.1
|
107 |
+
ninja==1.11.1.1
|
108 |
+
nltk==3.8.1
|
109 |
+
notebook-shim==0.2.3
|
110 |
+
numba==0.58.1
|
111 |
+
numpy==1.26.2
|
112 |
+
nvidia-cublas-cu12==12.1.3.1
|
113 |
+
nvidia-cuda-cupti-cu12==12.1.105
|
114 |
+
nvidia-cuda-nvrtc-cu12==12.1.105
|
115 |
+
nvidia-cuda-runtime-cu12==12.1.105
|
116 |
+
nvidia-cudnn-cu12==8.9.2.26
|
117 |
+
nvidia-cufft-cu12==11.0.2.54
|
118 |
+
nvidia-curand-cu12==10.3.2.106
|
119 |
+
nvidia-cusolver-cu12==11.4.5.107
|
120 |
+
nvidia-cusparse-cu12==12.1.0.106
|
121 |
+
nvidia-nccl-cu12==2.20.5
|
122 |
+
nvidia-nvjitlink-cu12==12.3.101
|
123 |
+
nvidia-nvtx-cu12==12.1.105
|
124 |
+
oauthlib==3.2.2
|
125 |
+
opt-einsum==3.3.0
|
126 |
+
overrides==7.7.0
|
127 |
+
packaging==23.2
|
128 |
+
pandas==2.1.4
|
129 |
+
pandocfilters==1.5.1
|
130 |
+
parso==0.8.3
|
131 |
+
peft==0.7.1
|
132 |
+
pexpect==4.9.0
|
133 |
+
pillow==10.2.0
|
134 |
+
pip==24.0
|
135 |
+
platformdirs==4.1.0
|
136 |
+
pooch==1.8.0
|
137 |
+
prometheus-client==0.19.0
|
138 |
+
prompt-toolkit==3.0.43
|
139 |
+
protobuf==3.20.2
|
140 |
+
psutil==5.9.7
|
141 |
+
ptyprocess==0.7.0
|
142 |
+
pure-eval==0.2.2
|
143 |
+
py-cpuinfo==9.0.0
|
144 |
+
pyarrow-hotfix==0.6
|
145 |
+
pyarrow==14.0.2
|
146 |
+
pyasn1-modules==0.3.0
|
147 |
+
pyasn1==0.5.1
|
148 |
+
pycparser==2.21
|
149 |
+
pydantic-core==2.16.1
|
150 |
+
pydantic==2.6.0
|
151 |
+
pygments==2.17.2
|
152 |
+
pynvml==11.5.0
|
153 |
+
pyopenssl==23.2.0
|
154 |
+
pysocks==1.7.1
|
155 |
+
python-dateutil==2.8.2
|
156 |
+
python-json-logger==2.0.7
|
157 |
+
pytorch-triton==3.0.0+989adb9a29
|
158 |
+
pytz==2023.3.post1
|
159 |
+
pyyaml==6.0.1
|
160 |
+
pyzmq==25.1.2
|
161 |
+
rapidfuzz==3.6.1
|
162 |
+
referencing==0.33.0
|
163 |
+
regex==2023.12.25
|
164 |
+
requests-oauthlib==1.3.1
|
165 |
+
requests==2.31.0
|
166 |
+
responses==0.18.0
|
167 |
+
rfc3339-validator==0.1.4
|
168 |
+
rfc3986-validator==0.1.1
|
169 |
+
rich==13.7.0
|
170 |
+
rpds-py==0.17.1
|
171 |
+
rsa==4.9
|
172 |
+
safetensors==0.4.1
|
173 |
+
scikit-learn==1.3.2
|
174 |
+
scipy==1.11.4
|
175 |
+
send2trash==1.8.2
|
176 |
+
sentencepiece==0.1.99
|
177 |
+
sentry-sdk==1.39.1
|
178 |
+
setproctitle==1.3.3
|
179 |
+
setuptools==68.2.2
|
180 |
+
shtab==1.6.5
|
181 |
+
six==1.16.0
|
182 |
+
smmap==5.0.1
|
183 |
+
sniffio==1.3.0
|
184 |
+
soundfile==0.12.1
|
185 |
+
soupsieve==2.5
|
186 |
+
soxr==0.3.7
|
187 |
+
stack-data==0.6.3
|
188 |
+
sympy==1.12
|
189 |
+
tensorboard-data-server==0.7.2
|
190 |
+
tensorboard==2.15.1
|
191 |
+
tensorflow-cpu==2.15.0.post1
|
192 |
+
tensorflow-estimator==2.15.0
|
193 |
+
tensorflow-io-gcs-filesystem==0.35.0
|
194 |
+
termcolor==2.4.0
|
195 |
+
terminado==0.18.0
|
196 |
+
threadpoolctl==3.2.0
|
197 |
+
tinycss2==1.2.1
|
198 |
+
tokenizers==0.15.0
|
199 |
+
torch==2.4.0.dev20240323+cu121
|
200 |
+
torchaudio==2.2.0.dev20240323+cu121
|
201 |
+
torchvision==0.19.0.dev20240323+cu121
|
202 |
+
tornado==6.4
|
203 |
+
tqdm==4.66.1
|
204 |
+
traitlets==5.14.1
|
205 |
+
transformers==4.39.0.dev0
|
206 |
+
triton==2.2.0
|
207 |
+
trl==0.8.6
|
208 |
+
types-python-dateutil==2.8.19.20240106
|
209 |
+
typing-extensions==4.10.0
|
210 |
+
tyro==0.7.0
|
211 |
+
tzdata==2023.3
|
212 |
+
uri-template==1.3.0
|
213 |
+
urllib3==1.26.18
|
214 |
+
wandb==0.16.1
|
215 |
+
wcwidth==0.2.13
|
216 |
+
webcolors==1.13
|
217 |
+
webencodings==0.5.1
|
218 |
+
websocket-client==1.7.0
|
219 |
+
werkzeug==3.0.1
|
220 |
+
wheel==0.41.2
|
221 |
+
wrapt==1.14.1
|
222 |
+
xxhash==3.4.1
|
223 |
+
yarl==1.9.4
|
wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json
ADDED
@@ -0,0 +1,558 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-1048-aws-x86_64-with-glibc2.31",
|
3 |
+
"python": "3.11.5",
|
4 |
+
"heartbeatAt": "2024-04-24T15:43:39.965097",
|
5 |
+
"startedAt": "2024-04-24T15:43:39.449266",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"config_full.yaml"
|
10 |
+
],
|
11 |
+
"state": "running",
|
12 |
+
"program": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py",
|
13 |
+
"codePathLocal": "run_sft.py",
|
14 |
+
"codePath": "run_sft.py",
|
15 |
+
"git": {
|
16 |
+
"remote": "https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat",
|
17 |
+
"commit": "cbea69c6b95c970317a1e47c3f614b55b33f8ed9"
|
18 |
+
},
|
19 |
+
"email": null,
|
20 |
+
"root": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat",
|
21 |
+
"host": "ip-26-0-162-233",
|
22 |
+
"username": "sanchit",
|
23 |
+
"executable": "/fsx/sanchit/miniconda3/envs/venv/bin/python",
|
24 |
+
"cpu_count": 96,
|
25 |
+
"cpu_count_logical": 96,
|
26 |
+
"cpu_freq": {
|
27 |
+
"current": 2721.9698645833337,
|
28 |
+
"min": 0.0,
|
29 |
+
"max": 0.0
|
30 |
+
},
|
31 |
+
"cpu_freq_per_core": [
|
32 |
+
{
|
33 |
+
"current": 3590.538,
|
34 |
+
"min": 0.0,
|
35 |
+
"max": 0.0
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"current": 2650.0,
|
39 |
+
"min": 0.0,
|
40 |
+
"max": 0.0
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"current": 2650.0,
|
44 |
+
"min": 0.0,
|
45 |
+
"max": 0.0
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"current": 2650.0,
|
49 |
+
"min": 0.0,
|
50 |
+
"max": 0.0
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"current": 2650.0,
|
54 |
+
"min": 0.0,
|
55 |
+
"max": 0.0
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"current": 2650.0,
|
59 |
+
"min": 0.0,
|
60 |
+
"max": 0.0
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"current": 2650.0,
|
64 |
+
"min": 0.0,
|
65 |
+
"max": 0.0
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"current": 2650.0,
|
69 |
+
"min": 0.0,
|
70 |
+
"max": 0.0
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"current": 2650.0,
|
74 |
+
"min": 0.0,
|
75 |
+
"max": 0.0
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"current": 2650.0,
|
79 |
+
"min": 0.0,
|
80 |
+
"max": 0.0
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"current": 2650.0,
|
84 |
+
"min": 0.0,
|
85 |
+
"max": 0.0
|
86 |
+
},
|
87 |
+
{
|
88 |
+
"current": 2650.0,
|
89 |
+
"min": 0.0,
|
90 |
+
"max": 0.0
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"current": 2650.0,
|
94 |
+
"min": 0.0,
|
95 |
+
"max": 0.0
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"current": 2650.0,
|
99 |
+
"min": 0.0,
|
100 |
+
"max": 0.0
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"current": 3595.996,
|
104 |
+
"min": 0.0,
|
105 |
+
"max": 0.0
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"current": 2650.0,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2650.0,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 2650.0,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2650.0,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2650.0,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2650.0,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2650.0,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2650.0,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 3597.59,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 2650.0,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 3399.936,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2650.0,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 2650.0,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2650.0,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 3598.273,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2650.0,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2650.0,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2650.0,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"current": 2650.0,
|
199 |
+
"min": 0.0,
|
200 |
+
"max": 0.0
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"current": 2650.0,
|
204 |
+
"min": 0.0,
|
205 |
+
"max": 0.0
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"current": 2650.0,
|
209 |
+
"min": 0.0,
|
210 |
+
"max": 0.0
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"current": 2650.0,
|
214 |
+
"min": 0.0,
|
215 |
+
"max": 0.0
|
216 |
+
},
|
217 |
+
{
|
218 |
+
"current": 3597.284,
|
219 |
+
"min": 0.0,
|
220 |
+
"max": 0.0
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"current": 3036.337,
|
224 |
+
"min": 0.0,
|
225 |
+
"max": 0.0
|
226 |
+
},
|
227 |
+
{
|
228 |
+
"current": 2650.0,
|
229 |
+
"min": 0.0,
|
230 |
+
"max": 0.0
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"current": 3597.887,
|
234 |
+
"min": 0.0,
|
235 |
+
"max": 0.0
|
236 |
+
},
|
237 |
+
{
|
238 |
+
"current": 2650.0,
|
239 |
+
"min": 0.0,
|
240 |
+
"max": 0.0
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"current": 3598.442,
|
244 |
+
"min": 0.0,
|
245 |
+
"max": 0.0
|
246 |
+
},
|
247 |
+
{
|
248 |
+
"current": 2650.0,
|
249 |
+
"min": 0.0,
|
250 |
+
"max": 0.0
|
251 |
+
},
|
252 |
+
{
|
253 |
+
"current": 2650.0,
|
254 |
+
"min": 0.0,
|
255 |
+
"max": 0.0
|
256 |
+
},
|
257 |
+
{
|
258 |
+
"current": 2650.0,
|
259 |
+
"min": 0.0,
|
260 |
+
"max": 0.0
|
261 |
+
},
|
262 |
+
{
|
263 |
+
"current": 2650.0,
|
264 |
+
"min": 0.0,
|
265 |
+
"max": 0.0
|
266 |
+
},
|
267 |
+
{
|
268 |
+
"current": 2650.0,
|
269 |
+
"min": 0.0,
|
270 |
+
"max": 0.0
|
271 |
+
},
|
272 |
+
{
|
273 |
+
"current": 2650.0,
|
274 |
+
"min": 0.0,
|
275 |
+
"max": 0.0
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"current": 2650.0,
|
279 |
+
"min": 0.0,
|
280 |
+
"max": 0.0
|
281 |
+
},
|
282 |
+
{
|
283 |
+
"current": 2650.0,
|
284 |
+
"min": 0.0,
|
285 |
+
"max": 0.0
|
286 |
+
},
|
287 |
+
{
|
288 |
+
"current": 2650.0,
|
289 |
+
"min": 0.0,
|
290 |
+
"max": 0.0
|
291 |
+
},
|
292 |
+
{
|
293 |
+
"current": 2650.0,
|
294 |
+
"min": 0.0,
|
295 |
+
"max": 0.0
|
296 |
+
},
|
297 |
+
{
|
298 |
+
"current": 2650.0,
|
299 |
+
"min": 0.0,
|
300 |
+
"max": 0.0
|
301 |
+
},
|
302 |
+
{
|
303 |
+
"current": 2650.0,
|
304 |
+
"min": 0.0,
|
305 |
+
"max": 0.0
|
306 |
+
},
|
307 |
+
{
|
308 |
+
"current": 2650.0,
|
309 |
+
"min": 0.0,
|
310 |
+
"max": 0.0
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"current": 2650.0,
|
314 |
+
"min": 0.0,
|
315 |
+
"max": 0.0
|
316 |
+
},
|
317 |
+
{
|
318 |
+
"current": 2650.0,
|
319 |
+
"min": 0.0,
|
320 |
+
"max": 0.0
|
321 |
+
},
|
322 |
+
{
|
323 |
+
"current": 2650.0,
|
324 |
+
"min": 0.0,
|
325 |
+
"max": 0.0
|
326 |
+
},
|
327 |
+
{
|
328 |
+
"current": 2650.0,
|
329 |
+
"min": 0.0,
|
330 |
+
"max": 0.0
|
331 |
+
},
|
332 |
+
{
|
333 |
+
"current": 2650.0,
|
334 |
+
"min": 0.0,
|
335 |
+
"max": 0.0
|
336 |
+
},
|
337 |
+
{
|
338 |
+
"current": 2650.0,
|
339 |
+
"min": 0.0,
|
340 |
+
"max": 0.0
|
341 |
+
},
|
342 |
+
{
|
343 |
+
"current": 2650.0,
|
344 |
+
"min": 0.0,
|
345 |
+
"max": 0.0
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"current": 2650.0,
|
349 |
+
"min": 0.0,
|
350 |
+
"max": 0.0
|
351 |
+
},
|
352 |
+
{
|
353 |
+
"current": 2650.0,
|
354 |
+
"min": 0.0,
|
355 |
+
"max": 0.0
|
356 |
+
},
|
357 |
+
{
|
358 |
+
"current": 2650.0,
|
359 |
+
"min": 0.0,
|
360 |
+
"max": 0.0
|
361 |
+
},
|
362 |
+
{
|
363 |
+
"current": 2650.0,
|
364 |
+
"min": 0.0,
|
365 |
+
"max": 0.0
|
366 |
+
},
|
367 |
+
{
|
368 |
+
"current": 2650.0,
|
369 |
+
"min": 0.0,
|
370 |
+
"max": 0.0
|
371 |
+
},
|
372 |
+
{
|
373 |
+
"current": 2650.0,
|
374 |
+
"min": 0.0,
|
375 |
+
"max": 0.0
|
376 |
+
},
|
377 |
+
{
|
378 |
+
"current": 2650.0,
|
379 |
+
"min": 0.0,
|
380 |
+
"max": 0.0
|
381 |
+
},
|
382 |
+
{
|
383 |
+
"current": 2650.0,
|
384 |
+
"min": 0.0,
|
385 |
+
"max": 0.0
|
386 |
+
},
|
387 |
+
{
|
388 |
+
"current": 2650.0,
|
389 |
+
"min": 0.0,
|
390 |
+
"max": 0.0
|
391 |
+
},
|
392 |
+
{
|
393 |
+
"current": 2650.0,
|
394 |
+
"min": 0.0,
|
395 |
+
"max": 0.0
|
396 |
+
},
|
397 |
+
{
|
398 |
+
"current": 2650.0,
|
399 |
+
"min": 0.0,
|
400 |
+
"max": 0.0
|
401 |
+
},
|
402 |
+
{
|
403 |
+
"current": 2650.0,
|
404 |
+
"min": 0.0,
|
405 |
+
"max": 0.0
|
406 |
+
},
|
407 |
+
{
|
408 |
+
"current": 2650.0,
|
409 |
+
"min": 0.0,
|
410 |
+
"max": 0.0
|
411 |
+
},
|
412 |
+
{
|
413 |
+
"current": 2650.0,
|
414 |
+
"min": 0.0,
|
415 |
+
"max": 0.0
|
416 |
+
},
|
417 |
+
{
|
418 |
+
"current": 2650.0,
|
419 |
+
"min": 0.0,
|
420 |
+
"max": 0.0
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"current": 2650.0,
|
424 |
+
"min": 0.0,
|
425 |
+
"max": 0.0
|
426 |
+
},
|
427 |
+
{
|
428 |
+
"current": 2650.0,
|
429 |
+
"min": 0.0,
|
430 |
+
"max": 0.0
|
431 |
+
},
|
432 |
+
{
|
433 |
+
"current": 2650.0,
|
434 |
+
"min": 0.0,
|
435 |
+
"max": 0.0
|
436 |
+
},
|
437 |
+
{
|
438 |
+
"current": 2650.0,
|
439 |
+
"min": 0.0,
|
440 |
+
"max": 0.0
|
441 |
+
},
|
442 |
+
{
|
443 |
+
"current": 2650.0,
|
444 |
+
"min": 0.0,
|
445 |
+
"max": 0.0
|
446 |
+
},
|
447 |
+
{
|
448 |
+
"current": 2650.0,
|
449 |
+
"min": 0.0,
|
450 |
+
"max": 0.0
|
451 |
+
},
|
452 |
+
{
|
453 |
+
"current": 2650.0,
|
454 |
+
"min": 0.0,
|
455 |
+
"max": 0.0
|
456 |
+
},
|
457 |
+
{
|
458 |
+
"current": 2650.0,
|
459 |
+
"min": 0.0,
|
460 |
+
"max": 0.0
|
461 |
+
},
|
462 |
+
{
|
463 |
+
"current": 2650.0,
|
464 |
+
"min": 0.0,
|
465 |
+
"max": 0.0
|
466 |
+
},
|
467 |
+
{
|
468 |
+
"current": 2650.0,
|
469 |
+
"min": 0.0,
|
470 |
+
"max": 0.0
|
471 |
+
},
|
472 |
+
{
|
473 |
+
"current": 2650.0,
|
474 |
+
"min": 0.0,
|
475 |
+
"max": 0.0
|
476 |
+
},
|
477 |
+
{
|
478 |
+
"current": 2650.0,
|
479 |
+
"min": 0.0,
|
480 |
+
"max": 0.0
|
481 |
+
},
|
482 |
+
{
|
483 |
+
"current": 2650.0,
|
484 |
+
"min": 0.0,
|
485 |
+
"max": 0.0
|
486 |
+
},
|
487 |
+
{
|
488 |
+
"current": 2650.0,
|
489 |
+
"min": 0.0,
|
490 |
+
"max": 0.0
|
491 |
+
},
|
492 |
+
{
|
493 |
+
"current": 2650.0,
|
494 |
+
"min": 0.0,
|
495 |
+
"max": 0.0
|
496 |
+
},
|
497 |
+
{
|
498 |
+
"current": 2650.0,
|
499 |
+
"min": 0.0,
|
500 |
+
"max": 0.0
|
501 |
+
},
|
502 |
+
{
|
503 |
+
"current": 2650.0,
|
504 |
+
"min": 0.0,
|
505 |
+
"max": 0.0
|
506 |
+
},
|
507 |
+
{
|
508 |
+
"current": 2650.0,
|
509 |
+
"min": 0.0,
|
510 |
+
"max": 0.0
|
511 |
+
}
|
512 |
+
],
|
513 |
+
"disk": {
|
514 |
+
"/": {
|
515 |
+
"total": 290.7472343444824,
|
516 |
+
"used": 59.263893127441406
|
517 |
+
}
|
518 |
+
},
|
519 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
520 |
+
"gpu_count": 8,
|
521 |
+
"gpu_devices": [
|
522 |
+
{
|
523 |
+
"name": "NVIDIA H100 80GB HBM3",
|
524 |
+
"memory_total": 85520809984
|
525 |
+
},
|
526 |
+
{
|
527 |
+
"name": "NVIDIA H100 80GB HBM3",
|
528 |
+
"memory_total": 85520809984
|
529 |
+
},
|
530 |
+
{
|
531 |
+
"name": "NVIDIA H100 80GB HBM3",
|
532 |
+
"memory_total": 85520809984
|
533 |
+
},
|
534 |
+
{
|
535 |
+
"name": "NVIDIA H100 80GB HBM3",
|
536 |
+
"memory_total": 85520809984
|
537 |
+
},
|
538 |
+
{
|
539 |
+
"name": "NVIDIA H100 80GB HBM3",
|
540 |
+
"memory_total": 85520809984
|
541 |
+
},
|
542 |
+
{
|
543 |
+
"name": "NVIDIA H100 80GB HBM3",
|
544 |
+
"memory_total": 85520809984
|
545 |
+
},
|
546 |
+
{
|
547 |
+
"name": "NVIDIA H100 80GB HBM3",
|
548 |
+
"memory_total": 85520809984
|
549 |
+
},
|
550 |
+
{
|
551 |
+
"name": "NVIDIA H100 80GB HBM3",
|
552 |
+
"memory_total": 85520809984
|
553 |
+
}
|
554 |
+
],
|
555 |
+
"memory": {
|
556 |
+
"total": 1999.9855270385742
|
557 |
+
}
|
558 |
+
}
|
wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"train/loss": 14.0246, "train/grad_norm": 1440.0, "train/learning_rate": 2.0000000000000002e-07, "train/epoch": 0.0, "train/global_step": 1, "_timestamp": 1713973432.7827635, "_runtime": 13.312107563018799, "_step": 0, "_wandb": {"runtime": 14}}
|
wandb/run-20240424_154339-mwp0iutr/logs/debug-internal.log
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-04-24 15:43:39,468 INFO StreamThr :1848599 [internal.py:wandb_internal():86] W&B internal server running at pid: 1848599, started at: 2024-04-24 15:43:39.467078
|
2 |
+
2024-04-24 15:43:39,469 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status
|
3 |
+
2024-04-24 15:43:39,473 INFO WriterThread:1848599 [datastore.py:open_for_write():85] open: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb
|
4 |
+
2024-04-24 15:43:39,476 DEBUG SenderThread:1848599 [sender.py:send():382] send: header
|
5 |
+
2024-04-24 15:43:39,521 DEBUG SenderThread:1848599 [sender.py:send():382] send: run
|
6 |
+
2024-04-24 15:43:39,793 INFO SenderThread:1848599 [dir_watcher.py:__init__():211] watching files in: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files
|
7 |
+
2024-04-24 15:43:39,793 INFO SenderThread:1848599 [sender.py:_start_run_threads():1136] run started: mwp0iutr with start time 1713973419.470656
|
8 |
+
2024-04-24 15:43:39,798 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: check_version
|
9 |
+
2024-04-24 15:43:39,799 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: check_version
|
10 |
+
2024-04-24 15:43:39,851 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: run_start
|
11 |
+
2024-04-24 15:43:39,908 DEBUG HandlerThread:1848599 [system_info.py:__init__():32] System info init
|
12 |
+
2024-04-24 15:43:39,908 DEBUG HandlerThread:1848599 [system_info.py:__init__():47] System info init done
|
13 |
+
2024-04-24 15:43:39,908 INFO HandlerThread:1848599 [system_monitor.py:start():194] Starting system monitor
|
14 |
+
2024-04-24 15:43:39,908 INFO SystemMonitor:1848599 [system_monitor.py:_start():158] Starting system asset monitoring threads
|
15 |
+
2024-04-24 15:43:39,909 INFO HandlerThread:1848599 [system_monitor.py:probe():214] Collecting system info
|
16 |
+
2024-04-24 15:43:39,909 INFO SystemMonitor:1848599 [interfaces.py:start():190] Started cpu monitoring
|
17 |
+
2024-04-24 15:43:39,909 INFO SystemMonitor:1848599 [interfaces.py:start():190] Started disk monitoring
|
18 |
+
2024-04-24 15:43:39,910 INFO SystemMonitor:1848599 [interfaces.py:start():190] Started gpu monitoring
|
19 |
+
2024-04-24 15:43:39,911 INFO SystemMonitor:1848599 [interfaces.py:start():190] Started memory monitoring
|
20 |
+
2024-04-24 15:43:39,911 INFO SystemMonitor:1848599 [interfaces.py:start():190] Started network monitoring
|
21 |
+
2024-04-24 15:43:39,965 DEBUG HandlerThread:1848599 [system_info.py:probe():196] Probing system
|
22 |
+
2024-04-24 15:43:39,967 DEBUG HandlerThread:1848599 [system_info.py:_probe_git():181] Probing git
|
23 |
+
2024-04-24 15:43:39,987 DEBUG HandlerThread:1848599 [system_info.py:_probe_git():189] Probing git done
|
24 |
+
2024-04-24 15:43:39,987 DEBUG HandlerThread:1848599 [system_info.py:probe():244] Probing system done
|
25 |
+
2024-04-24 15:43:39,987 DEBUG HandlerThread:1848599 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-1048-aws-x86_64-with-glibc2.31', 'python': '3.11.5', 'heartbeatAt': '2024-04-24T15:43:39.965097', 'startedAt': '2024-04-24T15:43:39.449266', 'docker': None, 'cuda': None, 'args': ('config_full.yaml',), 'state': 'running', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'codePathLocal': 'run_sft.py', 'codePath': 'run_sft.py', 'git': {'remote': 'https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat', 'commit': 'cbea69c6b95c970317a1e47c3f614b55b33f8ed9'}, 'email': None, 'root': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat', 'host': 'ip-26-0-162-233', 'username': 'sanchit', 'executable': '/fsx/sanchit/miniconda3/envs/venv/bin/python', 'cpu_count': 96, 'cpu_count_logical': 96, 'cpu_freq': {'current': 2721.9698645833337, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 3590.538, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3595.996, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.59, 'min': 0.0, 
'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3399.936, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.273, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.284, 'min': 0.0, 'max': 0.0}, {'current': 3036.337, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.887, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.442, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 
0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 290.7472343444824, 'used': 59.263893127441406}}, 'gpu': 'NVIDIA H100 80GB HBM3', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}], 'memory': {'total': 1999.9855270385742}}
|
26 |
+
2024-04-24 15:43:39,988 INFO HandlerThread:1848599 [system_monitor.py:probe():224] Finished collecting system info
|
27 |
+
2024-04-24 15:43:39,988 INFO HandlerThread:1848599 [system_monitor.py:probe():227] Publishing system info
|
28 |
+
2024-04-24 15:43:39,988 DEBUG HandlerThread:1848599 [system_info.py:_save_pip():52] Saving list of pip packages installed into the current environment
|
29 |
+
2024-04-24 15:43:39,989 DEBUG HandlerThread:1848599 [system_info.py:_save_pip():68] Saving pip packages done
|
30 |
+
2024-04-24 15:43:39,990 DEBUG HandlerThread:1848599 [system_info.py:_save_conda():75] Saving list of conda packages installed into the current environment
|
31 |
+
2024-04-24 15:43:40,795 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml
|
32 |
+
2024-04-24 15:43:40,796 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/requirements.txt
|
33 |
+
2024-04-24 15:43:45,799 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml
|
34 |
+
2024-04-24 15:43:45,805 DEBUG HandlerThread:1848599 [system_info.py:_save_conda():87] Saving conda packages done
|
35 |
+
2024-04-24 15:43:45,807 INFO HandlerThread:1848599 [system_monitor.py:probe():229] Finished publishing system info
|
36 |
+
2024-04-24 15:43:45,857 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
|
37 |
+
2024-04-24 15:43:45,857 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: keepalive
|
38 |
+
2024-04-24 15:43:45,858 DEBUG SenderThread:1848599 [sender.py:send():382] send: files
|
39 |
+
2024-04-24 15:43:45,858 INFO SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-metadata.json with policy now
|
40 |
+
2024-04-24 15:43:45,864 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: stop_status
|
41 |
+
2024-04-24 15:43:45,865 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: stop_status
|
42 |
+
2024-04-24 15:43:45,867 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: internal_messages
|
43 |
+
2024-04-24 15:43:45,993 DEBUG SenderThread:1848599 [sender.py:send():382] send: telemetry
|
44 |
+
2024-04-24 15:43:45,993 DEBUG SenderThread:1848599 [sender.py:send():382] send: config
|
45 |
+
2024-04-24 15:43:45,993 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
|
46 |
+
2024-04-24 15:43:45,994 DEBUG SenderThread:1848599 [sender.py:send():382] send: telemetry
|
47 |
+
2024-04-24 15:43:45,994 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
|
48 |
+
2024-04-24 15:43:45,994 WARNING SenderThread:1848599 [sender.py:send_metric():1343] Seen metric with glob (shouldn't happen)
|
49 |
+
2024-04-24 15:43:45,994 DEBUG SenderThread:1848599 [sender.py:send():382] send: telemetry
|
50 |
+
2024-04-24 15:43:46,179 INFO wandb-upload_0:1848599 [upload_job.py:push():131] Uploaded file /tmp/tmphsb5r9cdwandb/sgr8lmob-wandb-metadata.json
|
51 |
+
2024-04-24 15:43:46,800 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json
|
52 |
+
2024-04-24 15:43:46,801 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
|
53 |
+
2024-04-24 15:43:48,803 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
|
54 |
+
2024-04-24 15:43:50,251 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
|
55 |
+
2024-04-24 15:43:52,783 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: partial_history
|
56 |
+
2024-04-24 15:43:52,785 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
|
57 |
+
2024-04-24 15:43:52,785 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
|
58 |
+
2024-04-24 15:43:52,786 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
|
59 |
+
2024-04-24 15:43:52,786 DEBUG SenderThread:1848599 [sender.py:send():382] send: metric
|
60 |
+
2024-04-24 15:43:52,786 DEBUG SenderThread:1848599 [sender.py:send():382] send: history
|
61 |
+
2024-04-24 15:43:52,786 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: summary_record
|
62 |
+
2024-04-24 15:43:52,788 INFO SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
|
63 |
+
2024-04-24 15:43:52,807 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
|
64 |
+
2024-04-24 15:43:54,212 DEBUG SenderThread:1848599 [sender.py:send():382] send: exit
|
65 |
+
2024-04-24 15:43:54,212 INFO SenderThread:1848599 [sender.py:send_exit():589] handling exit code: 1
|
66 |
+
2024-04-24 15:43:54,212 INFO SenderThread:1848599 [sender.py:send_exit():591] handling runtime: 14
|
67 |
+
2024-04-24 15:43:54,213 INFO SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
|
68 |
+
2024-04-24 15:43:54,213 INFO SenderThread:1848599 [sender.py:send_exit():597] send defer
|
69 |
+
2024-04-24 15:43:54,213 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
70 |
+
2024-04-24 15:43:54,213 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 0
|
71 |
+
2024-04-24 15:43:54,214 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
72 |
+
2024-04-24 15:43:54,214 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 0
|
73 |
+
2024-04-24 15:43:54,214 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 1
|
74 |
+
2024-04-24 15:43:54,214 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
75 |
+
2024-04-24 15:43:54,214 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 1
|
76 |
+
2024-04-24 15:43:54,214 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
77 |
+
2024-04-24 15:43:54,214 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 1
|
78 |
+
2024-04-24 15:43:54,214 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 2
|
79 |
+
2024-04-24 15:43:54,214 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
80 |
+
2024-04-24 15:43:54,214 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 2
|
81 |
+
2024-04-24 15:43:54,214 INFO HandlerThread:1848599 [system_monitor.py:finish():203] Stopping system monitor
|
82 |
+
2024-04-24 15:43:54,214 DEBUG SystemMonitor:1848599 [system_monitor.py:_start():172] Starting system metrics aggregation loop
|
83 |
+
2024-04-24 15:43:54,215 DEBUG SystemMonitor:1848599 [system_monitor.py:_start():179] Finished system metrics aggregation loop
|
84 |
+
2024-04-24 15:43:54,215 DEBUG SystemMonitor:1848599 [system_monitor.py:_start():183] Publishing last batch of metrics
|
85 |
+
2024-04-24 15:43:54,215 INFO HandlerThread:1848599 [interfaces.py:finish():202] Joined cpu monitor
|
86 |
+
2024-04-24 15:43:54,217 INFO HandlerThread:1848599 [interfaces.py:finish():202] Joined disk monitor
|
87 |
+
2024-04-24 15:43:54,810 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
|
88 |
+
2024-04-24 15:43:54,810 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
|
89 |
+
2024-04-24 15:43:56,812 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
|
90 |
+
2024-04-24 15:43:57,141 INFO HandlerThread:1848599 [interfaces.py:finish():202] Joined gpu monitor
|
91 |
+
2024-04-24 15:43:57,142 INFO HandlerThread:1848599 [interfaces.py:finish():202] Joined memory monitor
|
92 |
+
2024-04-24 15:43:57,142 INFO HandlerThread:1848599 [interfaces.py:finish():202] Joined network monitor
|
93 |
+
2024-04-24 15:43:57,142 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: poll_exit
|
94 |
+
2024-04-24 15:43:57,143 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
|
95 |
+
2024-04-24 15:43:57,143 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
96 |
+
2024-04-24 15:43:57,143 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 2
|
97 |
+
2024-04-24 15:43:57,143 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 3
|
98 |
+
2024-04-24 15:43:57,144 DEBUG SenderThread:1848599 [sender.py:send():382] send: stats
|
99 |
+
2024-04-24 15:43:57,144 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
100 |
+
2024-04-24 15:43:57,144 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: poll_exit
|
101 |
+
2024-04-24 15:43:57,145 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 3
|
102 |
+
2024-04-24 15:43:57,146 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
103 |
+
2024-04-24 15:43:57,146 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 3
|
104 |
+
2024-04-24 15:43:57,146 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 4
|
105 |
+
2024-04-24 15:43:57,146 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
106 |
+
2024-04-24 15:43:57,146 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 4
|
107 |
+
2024-04-24 15:43:57,147 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
108 |
+
2024-04-24 15:43:57,147 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 4
|
109 |
+
2024-04-24 15:43:57,147 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 5
|
110 |
+
2024-04-24 15:43:57,147 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
111 |
+
2024-04-24 15:43:57,147 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 5
|
112 |
+
2024-04-24 15:43:57,147 DEBUG SenderThread:1848599 [sender.py:send():382] send: summary
|
113 |
+
2024-04-24 15:43:57,149 INFO SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
|
114 |
+
2024-04-24 15:43:57,149 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
115 |
+
2024-04-24 15:43:57,149 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 5
|
116 |
+
2024-04-24 15:43:57,149 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 6
|
117 |
+
2024-04-24 15:43:57,149 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
118 |
+
2024-04-24 15:43:57,149 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 6
|
119 |
+
2024-04-24 15:43:57,149 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
120 |
+
2024-04-24 15:43:57,149 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 6
|
121 |
+
2024-04-24 15:43:57,152 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
|
122 |
+
2024-04-24 15:43:57,275 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 7
|
123 |
+
2024-04-24 15:43:57,275 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
124 |
+
2024-04-24 15:43:57,275 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 7
|
125 |
+
2024-04-24 15:43:57,275 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
126 |
+
2024-04-24 15:43:57,275 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 7
|
127 |
+
2024-04-24 15:43:57,814 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/config.yaml
|
128 |
+
2024-04-24 15:43:57,814 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
|
129 |
+
2024-04-24 15:43:58,791 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 8
|
130 |
+
2024-04-24 15:43:58,792 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
131 |
+
2024-04-24 15:43:58,792 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 8
|
132 |
+
2024-04-24 15:43:58,792 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
133 |
+
2024-04-24 15:43:58,792 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 8
|
134 |
+
2024-04-24 15:43:58,792 INFO SenderThread:1848599 [job_builder.py:build():298] Attempting to build job artifact
|
135 |
+
2024-04-24 15:43:58,794 INFO SenderThread:1848599 [job_builder.py:_get_source_type():428] is repo sourced job
|
136 |
+
2024-04-24 15:43:58,815 INFO Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
|
137 |
+
2024-04-24 15:43:58,832 INFO SenderThread:1848599 [job_builder.py:build():404] adding wandb-job metadata file
|
138 |
+
2024-04-24 15:43:58,858 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 9
|
139 |
+
2024-04-24 15:43:58,859 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
140 |
+
2024-04-24 15:43:58,859 DEBUG SenderThread:1848599 [sender.py:send():382] send: artifact
|
141 |
+
2024-04-24 15:43:58,859 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 9
|
142 |
+
2024-04-24 15:43:59,524 INFO wandb-upload_0:1848599 [upload_job.py:push():89] Uploaded file /admin/home/sanchit/.local/share/wandb/artifacts/staging/tmp1vajxumh
|
143 |
+
2024-04-24 15:43:59,530 INFO wandb-upload_1:1848599 [upload_job.py:push():89] Uploaded file /admin/home/sanchit/.local/share/wandb/artifacts/staging/tmp824ipvc5
|
144 |
+
2024-04-24 15:44:00,093 INFO SenderThread:1848599 [sender.py:send_artifact():1470] sent artifact job-https___huggingface.co_sanchit-gandhi_distil-zephyr-1.5b-ssft-ultrachat_run_sft.py - {'id': 'QXJ0aWZhY3Q6ODA4NTQyNDIx', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjE2NjI0NzU4Nw==', 'latestArtifact': None}}
|
145 |
+
2024-04-24 15:44:00,093 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
146 |
+
2024-04-24 15:44:00,093 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 9
|
147 |
+
2024-04-24 15:44:00,093 INFO SenderThread:1848599 [dir_watcher.py:finish():358] shutting down directory watcher
|
148 |
+
2024-04-24 15:44:00,213 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: keepalive
|
149 |
+
2024-04-24 15:44:00,816 INFO SenderThread:1848599 [dir_watcher.py:finish():388] scan: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files
|
150 |
+
2024-04-24 15:44:00,817 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml conda-environment.yaml
|
151 |
+
2024-04-24 15:44:00,817 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json wandb-summary.json
|
152 |
+
2024-04-24 15:44:00,817 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log output.log
|
153 |
+
2024-04-24 15:44:00,821 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/config.yaml config.yaml
|
154 |
+
2024-04-24 15:44:00,824 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/requirements.txt requirements.txt
|
155 |
+
2024-04-24 15:44:00,826 INFO SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json wandb-metadata.json
|
156 |
+
2024-04-24 15:44:00,826 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 10
|
157 |
+
2024-04-24 15:44:00,828 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
158 |
+
2024-04-24 15:44:00,828 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 10
|
159 |
+
2024-04-24 15:44:00,828 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
160 |
+
2024-04-24 15:44:00,828 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 10
|
161 |
+
2024-04-24 15:44:00,828 INFO SenderThread:1848599 [file_pusher.py:finish():175] shutting down file pusher
|
162 |
+
2024-04-24 15:44:01,006 INFO wandb-upload_0:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml
|
163 |
+
2024-04-24 15:44:01,059 INFO wandb-upload_1:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
|
164 |
+
2024-04-24 15:44:01,161 INFO wandb-upload_2:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
|
165 |
+
2024-04-24 15:44:01,169 INFO wandb-upload_3:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/config.yaml
|
166 |
+
2024-04-24 15:44:01,184 INFO wandb-upload_4:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/requirements.txt
|
167 |
+
2024-04-24 15:44:01,384 INFO Thread-11 (_thread_body):1848599 [sender.py:transition_state():617] send defer: 11
|
168 |
+
2024-04-24 15:44:01,385 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
169 |
+
2024-04-24 15:44:01,385 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 11
|
170 |
+
2024-04-24 15:44:01,385 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
171 |
+
2024-04-24 15:44:01,385 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 11
|
172 |
+
2024-04-24 15:44:01,385 INFO SenderThread:1848599 [file_pusher.py:join():181] waiting for file pusher
|
173 |
+
2024-04-24 15:44:01,385 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 12
|
174 |
+
2024-04-24 15:44:01,385 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
175 |
+
2024-04-24 15:44:01,385 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 12
|
176 |
+
2024-04-24 15:44:01,385 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
177 |
+
2024-04-24 15:44:01,385 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 12
|
178 |
+
2024-04-24 15:44:01,386 INFO SenderThread:1848599 [file_stream.py:finish():595] file stream finish called
|
179 |
+
2024-04-24 15:44:01,445 INFO SenderThread:1848599 [file_stream.py:finish():599] file stream finish is done
|
180 |
+
2024-04-24 15:44:01,445 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 13
|
181 |
+
2024-04-24 15:44:01,445 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
182 |
+
2024-04-24 15:44:01,445 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 13
|
183 |
+
2024-04-24 15:44:01,445 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
184 |
+
2024-04-24 15:44:01,445 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 13
|
185 |
+
2024-04-24 15:44:01,445 INFO SenderThread:1848599 [sender.py:transition_state():617] send defer: 14
|
186 |
+
2024-04-24 15:44:01,446 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
|
187 |
+
2024-04-24 15:44:01,446 DEBUG SenderThread:1848599 [sender.py:send():382] send: final
|
188 |
+
2024-04-24 15:44:01,446 INFO HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 14
|
189 |
+
2024-04-24 15:44:01,446 DEBUG SenderThread:1848599 [sender.py:send():382] send: footer
|
190 |
+
2024-04-24 15:44:01,446 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: defer
|
191 |
+
2024-04-24 15:44:01,446 INFO SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 14
|
192 |
+
2024-04-24 15:44:01,447 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: poll_exit
|
193 |
+
2024-04-24 15:44:01,447 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: poll_exit
|
194 |
+
2024-04-24 15:44:01,447 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: server_info
|
195 |
+
2024-04-24 15:44:01,447 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: get_summary
|
196 |
+
2024-04-24 15:44:01,448 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: server_info
|
197 |
+
2024-04-24 15:44:01,449 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: sampled_history
|
198 |
+
2024-04-24 15:44:01,449 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: internal_messages
|
199 |
+
2024-04-24 15:44:01,450 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: job_info
|
200 |
+
2024-04-24 15:44:01,507 DEBUG SenderThread:1848599 [sender.py:send_request():409] send_request: job_info
|
201 |
+
2024-04-24 15:44:01,508 INFO MainThread:1848599 [wandb_run.py:_footer_history_summary_info():3837] rendering history
|
202 |
+
2024-04-24 15:44:01,508 INFO MainThread:1848599 [wandb_run.py:_footer_history_summary_info():3869] rendering summary
|
203 |
+
2024-04-24 15:44:01,508 INFO MainThread:1848599 [wandb_run.py:_footer_sync_info():3796] logging synced files
|
204 |
+
2024-04-24 15:44:01,508 DEBUG HandlerThread:1848599 [handler.py:handle_request():146] handle_request: shutdown
|
205 |
+
2024-04-24 15:44:01,508 INFO HandlerThread:1848599 [handler.py:finish():866] shutting down handler
|
206 |
+
2024-04-24 15:44:02,450 INFO WriterThread:1848599 [datastore.py:close():294] close: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb
|
207 |
+
2024-04-24 15:44:02,508 INFO SenderThread:1848599 [sender.py:finish():1548] shutting down sender
|
208 |
+
2024-04-24 15:44:02,508 INFO SenderThread:1848599 [file_pusher.py:finish():175] shutting down file pusher
|
209 |
+
2024-04-24 15:44:02,508 INFO SenderThread:1848599 [file_pusher.py:join():181] waiting for file pusher
|
wandb/run-20240424_154339-mwp0iutr/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-04-24 15:43:39,459 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
|
2 |
+
2024-04-24 15:43:39,459 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Configure stats pid to 1840687
|
3 |
+
2024-04-24 15:43:39,459 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
|
4 |
+
2024-04-24 15:43:39,459 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/settings
|
5 |
+
2024-04-24 15:43:39,459 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
|
6 |
+
2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py'}
|
8 |
+
2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/logs/debug.log
|
9 |
+
2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/logs/debug-internal.log
|
10 |
+
2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:init():564] calling init triggers
|
11 |
+
2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
|
12 |
+
config: {}
|
13 |
+
2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:init():614] starting backend
|
14 |
+
2024-04-24 15:43:39,460 INFO MainThread:1840687 [wandb_init.py:init():618] setting up manager
|
15 |
+
2024-04-24 15:43:39,465 INFO MainThread:1840687 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-04-24 15:43:39,470 INFO MainThread:1840687 [wandb_init.py:init():624] backend started and connected
|
17 |
+
2024-04-24 15:43:39,472 INFO MainThread:1840687 [wandb_init.py:init():716] updated telemetry
|
18 |
+
2024-04-24 15:43:39,520 INFO MainThread:1840687 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-04-24 15:43:39,798 INFO MainThread:1840687 [wandb_run.py:_on_init():2254] communicating current version
|
20 |
+
2024-04-24 15:43:39,844 INFO MainThread:1840687 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-04-24 15:43:39,844 INFO MainThread:1840687 [wandb_init.py:init():800] starting run threads in backend
|
23 |
+
2024-04-24 15:43:45,864 INFO MainThread:1840687 [wandb_run.py:_console_start():2233] atexit reg
|
24 |
+
2024-04-24 15:43:45,864 INFO MainThread:1840687 [wandb_run.py:_redirect():2088] redirect: wrap_raw
|
25 |
+
2024-04-24 15:43:45,864 INFO MainThread:1840687 [wandb_run.py:_redirect():2153] Wrapping output streams.
|
26 |
+
2024-04-24 15:43:45,864 INFO MainThread:1840687 [wandb_run.py:_redirect():2178] Redirects installed.
|
27 |
+
2024-04-24 15:43:45,866 INFO MainThread:1840687 [wandb_init.py:init():841] run started, returning control to user process
|
28 |
+
2024-04-24 15:43:45,867 INFO MainThread:1840687 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.40.0.dev0', 'model_type': 'mistral', 'output_dir': './', 'overwrite_output_dir': 
True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 64, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 20000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr24_14-23-38_ip-26-0-162-233', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 5000, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 5000, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
|
29 |
+
2024-04-24 15:44:02,589 WARNING MsgRouterThr:1840687 [router.py:message_loop():77] message_loop has been closed
|
wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb
ADDED
Binary file (26.4 kB). View file
|
|
wandb/run-20240424_164324-xfbnm7qo/files/conda-environment.yaml
ADDED
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: venv
|
2 |
+
channels:
|
3 |
+
- pytorch
|
4 |
+
- nvidia
|
5 |
+
- defaults
|
6 |
+
dependencies:
|
7 |
+
- _libgcc_mutex=0.1=main
|
8 |
+
- _openmp_mutex=5.1=1_gnu
|
9 |
+
- blas=1.0=mkl
|
10 |
+
- brotli-python=1.0.9=py311h6a678d5_7
|
11 |
+
- bzip2=1.0.8=h7b6447c_0
|
12 |
+
- ca-certificates=2023.12.12=h06a4308_0
|
13 |
+
- certifi=2023.11.17=py311h06a4308_0
|
14 |
+
- cffi=1.16.0=py311h5eee18b_0
|
15 |
+
- cryptography=41.0.7=py311hdda0065_0
|
16 |
+
- cuda-cudart=12.1.105=0
|
17 |
+
- cuda-cupti=12.1.105=0
|
18 |
+
- cuda-libraries=12.1.0=0
|
19 |
+
- cuda-nvrtc=12.1.105=0
|
20 |
+
- cuda-nvtx=12.1.105=0
|
21 |
+
- cuda-opencl=12.3.101=0
|
22 |
+
- cuda-runtime=12.1.0=0
|
23 |
+
- ffmpeg=4.3=hf484d3e_0
|
24 |
+
- filelock=3.13.1=py311h06a4308_0
|
25 |
+
- freetype=2.12.1=h4a9f257_0
|
26 |
+
- giflib=5.2.1=h5eee18b_3
|
27 |
+
- gmp=6.2.1=h295c915_3
|
28 |
+
- gmpy2=2.1.2=py311hc9b5ff0_0
|
29 |
+
- gnutls=3.6.15=he1e5248_0
|
30 |
+
- intel-openmp=2023.1.0=hdb19cb5_46306
|
31 |
+
- jinja2=3.1.2=py311h06a4308_0
|
32 |
+
- jpeg=9e=h5eee18b_1
|
33 |
+
- lame=3.100=h7b6447c_0
|
34 |
+
- lcms2=2.12=h3be6417_0
|
35 |
+
- ld_impl_linux-64=2.38=h1181459_1
|
36 |
+
- lerc=3.0=h295c915_0
|
37 |
+
- libcublas=12.1.0.26=0
|
38 |
+
- libcufft=11.0.2.4=0
|
39 |
+
- libcufile=1.8.1.2=0
|
40 |
+
- libcurand=10.3.4.101=0
|
41 |
+
- libcusolver=11.4.4.55=0
|
42 |
+
- libcusparse=12.0.2.55=0
|
43 |
+
- libdeflate=1.17=h5eee18b_1
|
44 |
+
- libffi=3.4.4=h6a678d5_0
|
45 |
+
- libgcc-ng=11.2.0=h1234567_1
|
46 |
+
- libgomp=11.2.0=h1234567_1
|
47 |
+
- libiconv=1.16=h7f8727e_2
|
48 |
+
- libidn2=2.3.4=h5eee18b_0
|
49 |
+
- libjpeg-turbo=2.0.0=h9bf148f_0
|
50 |
+
- libnpp=12.0.2.50=0
|
51 |
+
- libnvjitlink=12.1.105=0
|
52 |
+
- libnvjpeg=12.1.1.14=0
|
53 |
+
- libpng=1.6.39=h5eee18b_0
|
54 |
+
- libstdcxx-ng=11.2.0=h1234567_1
|
55 |
+
- libtasn1=4.19.0=h5eee18b_0
|
56 |
+
- libtiff=4.5.1=h6a678d5_0
|
57 |
+
- libunistring=0.9.10=h27cfd23_0
|
58 |
+
- libuuid=1.41.5=h5eee18b_0
|
59 |
+
- libwebp=1.3.2=h11a3e52_0
|
60 |
+
- libwebp-base=1.3.2=h5eee18b_0
|
61 |
+
- llvm-openmp=14.0.6=h9e868ea_0
|
62 |
+
- lz4-c=1.9.4=h6a678d5_0
|
63 |
+
- markupsafe=2.1.1=py311h5eee18b_0
|
64 |
+
- mkl=2023.1.0=h213fc3f_46344
|
65 |
+
- mkl-service=2.4.0=py311h5eee18b_1
|
66 |
+
- mkl_fft=1.3.8=py311h5eee18b_0
|
67 |
+
- mkl_random=1.2.4=py311hdb19cb5_0
|
68 |
+
- mpc=1.1.0=h10f8cd9_1
|
69 |
+
- mpfr=4.0.2=hb69a4c5_1
|
70 |
+
- mpmath=1.3.0=py311h06a4308_0
|
71 |
+
- ncurses=6.4=h6a678d5_0
|
72 |
+
- nettle=3.7.3=hbbd107a_1
|
73 |
+
- networkx=3.1=py311h06a4308_0
|
74 |
+
- numpy=1.26.2=py311h08b1b3b_0
|
75 |
+
- numpy-base=1.26.2=py311hf175353_0
|
76 |
+
- openh264=2.1.1=h4ff587b_0
|
77 |
+
- openjpeg=2.4.0=h3ad879b_0
|
78 |
+
- openssl=3.0.12=h7f8727e_0
|
79 |
+
- pycparser=2.21=pyhd3eb1b0_0
|
80 |
+
- pyopenssl=23.2.0=py311h06a4308_0
|
81 |
+
- pysocks=1.7.1=py311h06a4308_0
|
82 |
+
- python=3.11.5=h955ad1f_0
|
83 |
+
- pytorch-cuda=12.1=ha16c6d3_5
|
84 |
+
- pytorch-mutex=1.0=cuda
|
85 |
+
- pyyaml=6.0.1=py311h5eee18b_0
|
86 |
+
- readline=8.2=h5eee18b_0
|
87 |
+
- requests=2.31.0=py311h06a4308_0
|
88 |
+
- setuptools=68.2.2=py311h06a4308_0
|
89 |
+
- sqlite=3.41.2=h5eee18b_0
|
90 |
+
- sympy=1.12=py311h06a4308_0
|
91 |
+
- tbb=2021.8.0=hdb19cb5_0
|
92 |
+
- tk=8.6.12=h1ccaba5_0
|
93 |
+
- wheel=0.41.2=py311h06a4308_0
|
94 |
+
- xz=5.4.5=h5eee18b_0
|
95 |
+
- yaml=0.2.5=h7b6447c_0
|
96 |
+
- zlib=1.2.13=h5eee18b_0
|
97 |
+
- zstd=1.5.5=hc292b87_0
|
98 |
+
- pip:
|
99 |
+
- absl-py==2.0.0
|
100 |
+
- accelerate==0.29.3
|
101 |
+
- aiohttp==3.9.1
|
102 |
+
- aiosignal==1.3.1
|
103 |
+
- annotated-types==0.6.0
|
104 |
+
- anyio==4.2.0
|
105 |
+
- appdirs==1.4.4
|
106 |
+
- argon2-cffi==23.1.0
|
107 |
+
- argon2-cffi-bindings==21.2.0
|
108 |
+
- arrow==1.3.0
|
109 |
+
- asttokens==2.4.1
|
110 |
+
- astunparse==1.6.3
|
111 |
+
- async-lru==2.0.4
|
112 |
+
- attrs==23.1.0
|
113 |
+
- audioread==3.0.1
|
114 |
+
- babel==2.14.0
|
115 |
+
- beautifulsoup4==4.12.3
|
116 |
+
- bitsandbytes==0.43.1
|
117 |
+
- bleach==6.1.0
|
118 |
+
- cachetools==5.3.2
|
119 |
+
- chardet==5.2.0
|
120 |
+
- charset-normalizer==3.3.2
|
121 |
+
- click==8.1.7
|
122 |
+
- comm==0.2.1
|
123 |
+
- datasets==2.18.1.dev0
|
124 |
+
- debugpy==1.8.1
|
125 |
+
- decorator==5.1.1
|
126 |
+
- deepspeed==0.12.2
|
127 |
+
- defusedxml==0.7.1
|
128 |
+
- dill==0.3.7
|
129 |
+
- docker-pycreds==0.4.0
|
130 |
+
- docstring-parser==0.15
|
131 |
+
- einops==0.7.0
|
132 |
+
- evaluate==0.4.0
|
133 |
+
- executing==2.0.1
|
134 |
+
- fastjsonschema==2.19.1
|
135 |
+
- flatbuffers==23.5.26
|
136 |
+
- fqdn==1.5.1
|
137 |
+
- frozenlist==1.4.1
|
138 |
+
- fsspec==2023.10.0
|
139 |
+
- gast==0.5.4
|
140 |
+
- gitdb==4.0.11
|
141 |
+
- gitpython==3.1.40
|
142 |
+
- google-auth==2.26.1
|
143 |
+
- google-auth-oauthlib==1.2.0
|
144 |
+
- google-pasta==0.2.0
|
145 |
+
- grpcio==1.60.0
|
146 |
+
- h11==0.14.0
|
147 |
+
- h5py==3.10.0
|
148 |
+
- hf-transfer==0.1.5
|
149 |
+
- hjson==3.1.0
|
150 |
+
- httpcore==1.0.2
|
151 |
+
- httpx==0.26.0
|
152 |
+
- huggingface-hub==0.22.2
|
153 |
+
- idna==3.6
|
154 |
+
- ipdb==0.13.13
|
155 |
+
- ipykernel==6.29.2
|
156 |
+
- ipython==8.21.0
|
157 |
+
- isoduration==20.11.0
|
158 |
+
- jedi==0.19.1
|
159 |
+
- jiwer==3.0.3
|
160 |
+
- joblib==1.3.2
|
161 |
+
- json5==0.9.14
|
162 |
+
- jsonpointer==2.4
|
163 |
+
- jsonschema==4.21.1
|
164 |
+
- jsonschema-specifications==2023.12.1
|
165 |
+
- jupyter-client==8.6.0
|
166 |
+
- jupyter-core==5.7.1
|
167 |
+
- jupyter-events==0.9.0
|
168 |
+
- jupyter-lsp==2.2.2
|
169 |
+
- jupyter-server==2.12.5
|
170 |
+
- jupyter-server-terminals==0.5.2
|
171 |
+
- jupyterlab==4.1.1
|
172 |
+
- jupyterlab-pygments==0.3.0
|
173 |
+
- jupyterlab-server==2.25.2
|
174 |
+
- keras==2.15.0
|
175 |
+
- lazy-loader==0.3
|
176 |
+
- libclang==16.0.6
|
177 |
+
- librosa==0.10.1
|
178 |
+
- llvmlite==0.41.1
|
179 |
+
- markdown==3.5.1
|
180 |
+
- markdown-it-py==3.0.0
|
181 |
+
- matplotlib-inline==0.1.6
|
182 |
+
- mdurl==0.1.2
|
183 |
+
- mistune==3.0.2
|
184 |
+
- ml-dtypes==0.2.0
|
185 |
+
- msgpack==1.0.7
|
186 |
+
- multidict==6.0.4
|
187 |
+
- multiprocess==0.70.15
|
188 |
+
- nbclient==0.9.0
|
189 |
+
- nbconvert==7.16.0
|
190 |
+
- nbformat==5.9.2
|
191 |
+
- nest-asyncio==1.6.0
|
192 |
+
- ninja==1.11.1.1
|
193 |
+
- nltk==3.8.1
|
194 |
+
- notebook-shim==0.2.3
|
195 |
+
- numba==0.58.1
|
196 |
+
- nvidia-cublas-cu12==12.1.3.1
|
197 |
+
- nvidia-cuda-cupti-cu12==12.1.105
|
198 |
+
- nvidia-cuda-nvrtc-cu12==12.1.105
|
199 |
+
- nvidia-cuda-runtime-cu12==12.1.105
|
200 |
+
- nvidia-cudnn-cu12==8.9.2.26
|
201 |
+
- nvidia-cufft-cu12==11.0.2.54
|
202 |
+
- nvidia-curand-cu12==10.3.2.106
|
203 |
+
- nvidia-cusolver-cu12==11.4.5.107
|
204 |
+
- nvidia-cusparse-cu12==12.1.0.106
|
205 |
+
- nvidia-nccl-cu12==2.20.5
|
206 |
+
- nvidia-nvjitlink-cu12==12.3.101
|
207 |
+
- nvidia-nvtx-cu12==12.1.105
|
208 |
+
- oauthlib==3.2.2
|
209 |
+
- opt-einsum==3.3.0
|
210 |
+
- overrides==7.7.0
|
211 |
+
- packaging==23.2
|
212 |
+
- pandas==2.1.4
|
213 |
+
- pandocfilters==1.5.1
|
214 |
+
- parso==0.8.3
|
215 |
+
- peft==0.7.1
|
216 |
+
- pexpect==4.9.0
|
217 |
+
- pillow==10.2.0
|
218 |
+
- pip==24.0
|
219 |
+
- platformdirs==4.1.0
|
220 |
+
- pooch==1.8.0
|
221 |
+
- prometheus-client==0.19.0
|
222 |
+
- prompt-toolkit==3.0.43
|
223 |
+
- protobuf==3.20.2
|
224 |
+
- psutil==5.9.7
|
225 |
+
- ptyprocess==0.7.0
|
226 |
+
- pure-eval==0.2.2
|
227 |
+
- py-cpuinfo==9.0.0
|
228 |
+
- pyarrow==14.0.2
|
229 |
+
- pyarrow-hotfix==0.6
|
230 |
+
- pyasn1==0.5.1
|
231 |
+
- pyasn1-modules==0.3.0
|
232 |
+
- pydantic==2.6.0
|
233 |
+
- pydantic-core==2.16.1
|
234 |
+
- pygments==2.17.2
|
235 |
+
- pynvml==11.5.0
|
236 |
+
- python-dateutil==2.8.2
|
237 |
+
- python-json-logger==2.0.7
|
238 |
+
- pytorch-triton==3.0.0+989adb9a29
|
239 |
+
- pytz==2023.3.post1
|
240 |
+
- pyzmq==25.1.2
|
241 |
+
- rapidfuzz==3.6.1
|
242 |
+
- referencing==0.33.0
|
243 |
+
- regex==2023.12.25
|
244 |
+
- requests-oauthlib==1.3.1
|
245 |
+
- responses==0.18.0
|
246 |
+
- rfc3339-validator==0.1.4
|
247 |
+
- rfc3986-validator==0.1.1
|
248 |
+
- rich==13.7.0
|
249 |
+
- rpds-py==0.17.1
|
250 |
+
- rsa==4.9
|
251 |
+
- safetensors==0.4.1
|
252 |
+
- scikit-learn==1.3.2
|
253 |
+
- scipy==1.11.4
|
254 |
+
- send2trash==1.8.2
|
255 |
+
- sentencepiece==0.1.99
|
256 |
+
- sentry-sdk==1.39.1
|
257 |
+
- setproctitle==1.3.3
|
258 |
+
- shtab==1.6.5
|
259 |
+
- six==1.16.0
|
260 |
+
- smmap==5.0.1
|
261 |
+
- sniffio==1.3.0
|
262 |
+
- soundfile==0.12.1
|
263 |
+
- soupsieve==2.5
|
264 |
+
- soxr==0.3.7
|
265 |
+
- stack-data==0.6.3
|
266 |
+
- tensorboard==2.15.1
|
267 |
+
- tensorboard-data-server==0.7.2
|
268 |
+
- tensorflow-cpu==2.15.0.post1
|
269 |
+
- tensorflow-estimator==2.15.0
|
270 |
+
- tensorflow-io-gcs-filesystem==0.35.0
|
271 |
+
- termcolor==2.4.0
|
272 |
+
- terminado==0.18.0
|
273 |
+
- threadpoolctl==3.2.0
|
274 |
+
- tinycss2==1.2.1
|
275 |
+
- tokenizers==0.15.0
|
276 |
+
- torch==2.4.0.dev20240323+cu121
|
277 |
+
- torchaudio==2.2.0.dev20240323+cu121
|
278 |
+
- torchvision==0.19.0.dev20240323+cu121
|
279 |
+
- tornado==6.4
|
280 |
+
- tqdm==4.66.1
|
281 |
+
- traitlets==5.14.1
|
282 |
+
- transformers==4.39.0.dev0
|
283 |
+
- triton==2.2.0
|
284 |
+
- trl==0.8.6
|
285 |
+
- types-python-dateutil==2.8.19.20240106
|
286 |
+
- typing-extensions==4.9.0
|
287 |
+
- tyro==0.7.0
|
288 |
+
- tzdata==2023.3
|
289 |
+
- uri-template==1.3.0
|
290 |
+
- urllib3==2.1.0
|
291 |
+
- wandb==0.16.1
|
292 |
+
- wcwidth==0.2.13
|
293 |
+
- webcolors==1.13
|
294 |
+
- webencodings==0.5.1
|
295 |
+
- websocket-client==1.7.0
|
296 |
+
- werkzeug==3.0.1
|
297 |
+
- wrapt==1.14.1
|
298 |
+
- xxhash==3.4.1
|
299 |
+
- yarl==1.9.4
|
300 |
+
prefix: /fsx/sanchit/miniconda3/envs/venv
|
wandb/run-20240424_164324-xfbnm7qo/files/config.yaml
ADDED
@@ -0,0 +1,663 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wandb_version: 1
|
2 |
+
|
3 |
+
_wandb:
|
4 |
+
desc: null
|
5 |
+
value:
|
6 |
+
python_version: 3.11.5
|
7 |
+
cli_version: 0.16.1
|
8 |
+
framework: huggingface
|
9 |
+
huggingface_version: 4.40.0.dev0
|
10 |
+
is_jupyter_run: false
|
11 |
+
is_kaggle_kernel: false
|
12 |
+
start_time: 1713977004.542006
|
13 |
+
t:
|
14 |
+
1:
|
15 |
+
- 1
|
16 |
+
- 2
|
17 |
+
- 3
|
18 |
+
- 5
|
19 |
+
- 11
|
20 |
+
- 49
|
21 |
+
- 51
|
22 |
+
- 53
|
23 |
+
- 55
|
24 |
+
- 71
|
25 |
+
- 84
|
26 |
+
- 98
|
27 |
+
2:
|
28 |
+
- 1
|
29 |
+
- 2
|
30 |
+
- 3
|
31 |
+
- 5
|
32 |
+
- 11
|
33 |
+
- 49
|
34 |
+
- 51
|
35 |
+
- 53
|
36 |
+
- 55
|
37 |
+
- 71
|
38 |
+
- 84
|
39 |
+
- 98
|
40 |
+
3:
|
41 |
+
- 7
|
42 |
+
- 23
|
43 |
+
4: 3.11.5
|
44 |
+
5: 0.16.1
|
45 |
+
6: 4.40.0.dev0
|
46 |
+
8:
|
47 |
+
- 5
|
48 |
+
9:
|
49 |
+
1: transformers_trainer
|
50 |
+
13: linux-x86_64
|
51 |
+
m:
|
52 |
+
- 1: train/global_step
|
53 |
+
6:
|
54 |
+
- 3
|
55 |
+
- 1: train/loss
|
56 |
+
5: 1
|
57 |
+
6:
|
58 |
+
- 1
|
59 |
+
- 1: train/grad_norm
|
60 |
+
5: 1
|
61 |
+
6:
|
62 |
+
- 1
|
63 |
+
- 1: train/learning_rate
|
64 |
+
5: 1
|
65 |
+
6:
|
66 |
+
- 1
|
67 |
+
- 1: train/epoch
|
68 |
+
5: 1
|
69 |
+
6:
|
70 |
+
- 1
|
71 |
+
vocab_size:
|
72 |
+
desc: null
|
73 |
+
value: 32000
|
74 |
+
max_position_embeddings:
|
75 |
+
desc: null
|
76 |
+
value: 32768
|
77 |
+
hidden_size:
|
78 |
+
desc: null
|
79 |
+
value: 4096
|
80 |
+
intermediate_size:
|
81 |
+
desc: null
|
82 |
+
value: 14336
|
83 |
+
num_hidden_layers:
|
84 |
+
desc: null
|
85 |
+
value: 6
|
86 |
+
num_attention_heads:
|
87 |
+
desc: null
|
88 |
+
value: 32
|
89 |
+
sliding_window:
|
90 |
+
desc: null
|
91 |
+
value: 4096
|
92 |
+
num_key_value_heads:
|
93 |
+
desc: null
|
94 |
+
value: 8
|
95 |
+
hidden_act:
|
96 |
+
desc: null
|
97 |
+
value: silu
|
98 |
+
initializer_range:
|
99 |
+
desc: null
|
100 |
+
value: 0.02
|
101 |
+
rms_norm_eps:
|
102 |
+
desc: null
|
103 |
+
value: 1.0e-05
|
104 |
+
use_cache:
|
105 |
+
desc: null
|
106 |
+
value: false
|
107 |
+
rope_theta:
|
108 |
+
desc: null
|
109 |
+
value: 10000.0
|
110 |
+
attention_dropout:
|
111 |
+
desc: null
|
112 |
+
value: 0.0
|
113 |
+
return_dict:
|
114 |
+
desc: null
|
115 |
+
value: true
|
116 |
+
output_hidden_states:
|
117 |
+
desc: null
|
118 |
+
value: false
|
119 |
+
output_attentions:
|
120 |
+
desc: null
|
121 |
+
value: false
|
122 |
+
torchscript:
|
123 |
+
desc: null
|
124 |
+
value: false
|
125 |
+
torch_dtype:
|
126 |
+
desc: null
|
127 |
+
value: bfloat16
|
128 |
+
use_bfloat16:
|
129 |
+
desc: null
|
130 |
+
value: false
|
131 |
+
tf_legacy_loss:
|
132 |
+
desc: null
|
133 |
+
value: false
|
134 |
+
pruned_heads:
|
135 |
+
desc: null
|
136 |
+
value: {}
|
137 |
+
tie_word_embeddings:
|
138 |
+
desc: null
|
139 |
+
value: false
|
140 |
+
chunk_size_feed_forward:
|
141 |
+
desc: null
|
142 |
+
value: 0
|
143 |
+
is_encoder_decoder:
|
144 |
+
desc: null
|
145 |
+
value: false
|
146 |
+
is_decoder:
|
147 |
+
desc: null
|
148 |
+
value: false
|
149 |
+
cross_attention_hidden_size:
|
150 |
+
desc: null
|
151 |
+
value: null
|
152 |
+
add_cross_attention:
|
153 |
+
desc: null
|
154 |
+
value: false
|
155 |
+
tie_encoder_decoder:
|
156 |
+
desc: null
|
157 |
+
value: false
|
158 |
+
max_length:
|
159 |
+
desc: null
|
160 |
+
value: 20
|
161 |
+
min_length:
|
162 |
+
desc: null
|
163 |
+
value: 0
|
164 |
+
do_sample:
|
165 |
+
desc: null
|
166 |
+
value: false
|
167 |
+
early_stopping:
|
168 |
+
desc: null
|
169 |
+
value: false
|
170 |
+
num_beams:
|
171 |
+
desc: null
|
172 |
+
value: 1
|
173 |
+
num_beam_groups:
|
174 |
+
desc: null
|
175 |
+
value: 1
|
176 |
+
diversity_penalty:
|
177 |
+
desc: null
|
178 |
+
value: 0.0
|
179 |
+
temperature:
|
180 |
+
desc: null
|
181 |
+
value: 1.0
|
182 |
+
top_k:
|
183 |
+
desc: null
|
184 |
+
value: 50
|
185 |
+
top_p:
|
186 |
+
desc: null
|
187 |
+
value: 1.0
|
188 |
+
typical_p:
|
189 |
+
desc: null
|
190 |
+
value: 1.0
|
191 |
+
repetition_penalty:
|
192 |
+
desc: null
|
193 |
+
value: 1.0
|
194 |
+
length_penalty:
|
195 |
+
desc: null
|
196 |
+
value: 1.0
|
197 |
+
no_repeat_ngram_size:
|
198 |
+
desc: null
|
199 |
+
value: 0
|
200 |
+
encoder_no_repeat_ngram_size:
|
201 |
+
desc: null
|
202 |
+
value: 0
|
203 |
+
bad_words_ids:
|
204 |
+
desc: null
|
205 |
+
value: null
|
206 |
+
num_return_sequences:
|
207 |
+
desc: null
|
208 |
+
value: 1
|
209 |
+
output_scores:
|
210 |
+
desc: null
|
211 |
+
value: false
|
212 |
+
return_dict_in_generate:
|
213 |
+
desc: null
|
214 |
+
value: false
|
215 |
+
forced_bos_token_id:
|
216 |
+
desc: null
|
217 |
+
value: null
|
218 |
+
forced_eos_token_id:
|
219 |
+
desc: null
|
220 |
+
value: null
|
221 |
+
remove_invalid_values:
|
222 |
+
desc: null
|
223 |
+
value: false
|
224 |
+
exponential_decay_length_penalty:
|
225 |
+
desc: null
|
226 |
+
value: null
|
227 |
+
suppress_tokens:
|
228 |
+
desc: null
|
229 |
+
value: null
|
230 |
+
begin_suppress_tokens:
|
231 |
+
desc: null
|
232 |
+
value: null
|
233 |
+
architectures:
|
234 |
+
desc: null
|
235 |
+
value:
|
236 |
+
- MistralForCausalLM
|
237 |
+
finetuning_task:
|
238 |
+
desc: null
|
239 |
+
value: null
|
240 |
+
id2label:
|
241 |
+
desc: null
|
242 |
+
value:
|
243 |
+
'0': LABEL_0
|
244 |
+
'1': LABEL_1
|
245 |
+
label2id:
|
246 |
+
desc: null
|
247 |
+
value:
|
248 |
+
LABEL_0: 0
|
249 |
+
LABEL_1: 1
|
250 |
+
tokenizer_class:
|
251 |
+
desc: null
|
252 |
+
value: null
|
253 |
+
prefix:
|
254 |
+
desc: null
|
255 |
+
value: null
|
256 |
+
bos_token_id:
|
257 |
+
desc: null
|
258 |
+
value: 1
|
259 |
+
pad_token_id:
|
260 |
+
desc: null
|
261 |
+
value: null
|
262 |
+
eos_token_id:
|
263 |
+
desc: null
|
264 |
+
value: 2
|
265 |
+
sep_token_id:
|
266 |
+
desc: null
|
267 |
+
value: null
|
268 |
+
decoder_start_token_id:
|
269 |
+
desc: null
|
270 |
+
value: null
|
271 |
+
task_specific_params:
|
272 |
+
desc: null
|
273 |
+
value: null
|
274 |
+
problem_type:
|
275 |
+
desc: null
|
276 |
+
value: null
|
277 |
+
_name_or_path:
|
278 |
+
desc: null
|
279 |
+
value: sanchit-gandhi/Mistral-7B-v0.1-6-layer
|
280 |
+
transformers_version:
|
281 |
+
desc: null
|
282 |
+
value: 4.40.0.dev0
|
283 |
+
model_type:
|
284 |
+
desc: null
|
285 |
+
value: mistral
|
286 |
+
output_dir:
|
287 |
+
desc: null
|
288 |
+
value: ./
|
289 |
+
overwrite_output_dir:
|
290 |
+
desc: null
|
291 |
+
value: true
|
292 |
+
do_train:
|
293 |
+
desc: null
|
294 |
+
value: false
|
295 |
+
do_eval:
|
296 |
+
desc: null
|
297 |
+
value: true
|
298 |
+
do_predict:
|
299 |
+
desc: null
|
300 |
+
value: false
|
301 |
+
evaluation_strategy:
|
302 |
+
desc: null
|
303 |
+
value: steps
|
304 |
+
prediction_loss_only:
|
305 |
+
desc: null
|
306 |
+
value: false
|
307 |
+
per_device_train_batch_size:
|
308 |
+
desc: null
|
309 |
+
value: 32
|
310 |
+
per_device_eval_batch_size:
|
311 |
+
desc: null
|
312 |
+
value: 32
|
313 |
+
per_gpu_train_batch_size:
|
314 |
+
desc: null
|
315 |
+
value: null
|
316 |
+
per_gpu_eval_batch_size:
|
317 |
+
desc: null
|
318 |
+
value: null
|
319 |
+
gradient_accumulation_steps:
|
320 |
+
desc: null
|
321 |
+
value: 1
|
322 |
+
eval_accumulation_steps:
|
323 |
+
desc: null
|
324 |
+
value: null
|
325 |
+
eval_delay:
|
326 |
+
desc: null
|
327 |
+
value: 0
|
328 |
+
learning_rate:
|
329 |
+
desc: null
|
330 |
+
value: 0.0001
|
331 |
+
weight_decay:
|
332 |
+
desc: null
|
333 |
+
value: 0.0
|
334 |
+
adam_beta1:
|
335 |
+
desc: null
|
336 |
+
value: 0.9
|
337 |
+
adam_beta2:
|
338 |
+
desc: null
|
339 |
+
value: 0.999
|
340 |
+
adam_epsilon:
|
341 |
+
desc: null
|
342 |
+
value: 1.0e-08
|
343 |
+
max_grad_norm:
|
344 |
+
desc: null
|
345 |
+
value: 1.0
|
346 |
+
num_train_epochs:
|
347 |
+
desc: null
|
348 |
+
value: 3.0
|
349 |
+
max_steps:
|
350 |
+
desc: null
|
351 |
+
value: 20000
|
352 |
+
lr_scheduler_type:
|
353 |
+
desc: null
|
354 |
+
value: linear
|
355 |
+
lr_scheduler_kwargs:
|
356 |
+
desc: null
|
357 |
+
value: {}
|
358 |
+
warmup_ratio:
|
359 |
+
desc: null
|
360 |
+
value: 0.0
|
361 |
+
warmup_steps:
|
362 |
+
desc: null
|
363 |
+
value: 500
|
364 |
+
log_level:
|
365 |
+
desc: null
|
366 |
+
value: info
|
367 |
+
log_level_replica:
|
368 |
+
desc: null
|
369 |
+
value: warning
|
370 |
+
log_on_each_node:
|
371 |
+
desc: null
|
372 |
+
value: true
|
373 |
+
logging_dir:
|
374 |
+
desc: null
|
375 |
+
value: ./runs/Apr24_16-42-31_ip-26-0-162-233
|
376 |
+
logging_strategy:
|
377 |
+
desc: null
|
378 |
+
value: steps
|
379 |
+
logging_first_step:
|
380 |
+
desc: null
|
381 |
+
value: true
|
382 |
+
logging_steps:
|
383 |
+
desc: null
|
384 |
+
value: 25
|
385 |
+
logging_nan_inf_filter:
|
386 |
+
desc: null
|
387 |
+
value: true
|
388 |
+
save_strategy:
|
389 |
+
desc: null
|
390 |
+
value: steps
|
391 |
+
save_steps:
|
392 |
+
desc: null
|
393 |
+
value: 500
|
394 |
+
save_total_limit:
|
395 |
+
desc: null
|
396 |
+
value: 5000
|
397 |
+
save_safetensors:
|
398 |
+
desc: null
|
399 |
+
value: true
|
400 |
+
save_on_each_node:
|
401 |
+
desc: null
|
402 |
+
value: false
|
403 |
+
save_only_model:
|
404 |
+
desc: null
|
405 |
+
value: false
|
406 |
+
no_cuda:
|
407 |
+
desc: null
|
408 |
+
value: false
|
409 |
+
use_cpu:
|
410 |
+
desc: null
|
411 |
+
value: false
|
412 |
+
use_mps_device:
|
413 |
+
desc: null
|
414 |
+
value: false
|
415 |
+
seed:
|
416 |
+
desc: null
|
417 |
+
value: 42
|
418 |
+
data_seed:
|
419 |
+
desc: null
|
420 |
+
value: null
|
421 |
+
jit_mode_eval:
|
422 |
+
desc: null
|
423 |
+
value: false
|
424 |
+
use_ipex:
|
425 |
+
desc: null
|
426 |
+
value: false
|
427 |
+
bf16:
|
428 |
+
desc: null
|
429 |
+
value: true
|
430 |
+
fp16:
|
431 |
+
desc: null
|
432 |
+
value: false
|
433 |
+
fp16_opt_level:
|
434 |
+
desc: null
|
435 |
+
value: O1
|
436 |
+
half_precision_backend:
|
437 |
+
desc: null
|
438 |
+
value: auto
|
439 |
+
bf16_full_eval:
|
440 |
+
desc: null
|
441 |
+
value: false
|
442 |
+
fp16_full_eval:
|
443 |
+
desc: null
|
444 |
+
value: false
|
445 |
+
tf32:
|
446 |
+
desc: null
|
447 |
+
value: null
|
448 |
+
local_rank:
|
449 |
+
desc: null
|
450 |
+
value: 0
|
451 |
+
ddp_backend:
|
452 |
+
desc: null
|
453 |
+
value: null
|
454 |
+
tpu_num_cores:
|
455 |
+
desc: null
|
456 |
+
value: null
|
457 |
+
tpu_metrics_debug:
|
458 |
+
desc: null
|
459 |
+
value: false
|
460 |
+
debug:
|
461 |
+
desc: null
|
462 |
+
value: []
|
463 |
+
dataloader_drop_last:
|
464 |
+
desc: null
|
465 |
+
value: false
|
466 |
+
eval_steps:
|
467 |
+
desc: null
|
468 |
+
value: 5000
|
469 |
+
dataloader_num_workers:
|
470 |
+
desc: null
|
471 |
+
value: 0
|
472 |
+
dataloader_prefetch_factor:
|
473 |
+
desc: null
|
474 |
+
value: null
|
475 |
+
past_index:
|
476 |
+
desc: null
|
477 |
+
value: -1
|
478 |
+
run_name:
|
479 |
+
desc: null
|
480 |
+
value: ./
|
481 |
+
disable_tqdm:
|
482 |
+
desc: null
|
483 |
+
value: false
|
484 |
+
remove_unused_columns:
|
485 |
+
desc: null
|
486 |
+
value: true
|
487 |
+
label_names:
|
488 |
+
desc: null
|
489 |
+
value: null
|
490 |
+
load_best_model_at_end:
|
491 |
+
desc: null
|
492 |
+
value: false
|
493 |
+
metric_for_best_model:
|
494 |
+
desc: null
|
495 |
+
value: null
|
496 |
+
greater_is_better:
|
497 |
+
desc: null
|
498 |
+
value: null
|
499 |
+
ignore_data_skip:
|
500 |
+
desc: null
|
501 |
+
value: false
|
502 |
+
fsdp:
|
503 |
+
desc: null
|
504 |
+
value: []
|
505 |
+
fsdp_min_num_params:
|
506 |
+
desc: null
|
507 |
+
value: 0
|
508 |
+
fsdp_config:
|
509 |
+
desc: null
|
510 |
+
value:
|
511 |
+
min_num_params: 0
|
512 |
+
xla: false
|
513 |
+
xla_fsdp_v2: false
|
514 |
+
xla_fsdp_grad_ckpt: false
|
515 |
+
fsdp_transformer_layer_cls_to_wrap:
|
516 |
+
desc: null
|
517 |
+
value: null
|
518 |
+
accelerator_config:
|
519 |
+
desc: null
|
520 |
+
value:
|
521 |
+
split_batches: false
|
522 |
+
dispatch_batches: null
|
523 |
+
even_batches: true
|
524 |
+
use_seedable_sampler: true
|
525 |
+
gradient_accumulation_kwargs: null
|
526 |
+
deepspeed:
|
527 |
+
desc: null
|
528 |
+
value: null
|
529 |
+
label_smoothing_factor:
|
530 |
+
desc: null
|
531 |
+
value: 0.0
|
532 |
+
optim:
|
533 |
+
desc: null
|
534 |
+
value: adamw_torch
|
535 |
+
optim_args:
|
536 |
+
desc: null
|
537 |
+
value: null
|
538 |
+
adafactor:
|
539 |
+
desc: null
|
540 |
+
value: false
|
541 |
+
group_by_length:
|
542 |
+
desc: null
|
543 |
+
value: false
|
544 |
+
length_column_name:
|
545 |
+
desc: null
|
546 |
+
value: length
|
547 |
+
report_to:
|
548 |
+
desc: null
|
549 |
+
value:
|
550 |
+
- tensorboard
|
551 |
+
- wandb
|
552 |
+
ddp_find_unused_parameters:
|
553 |
+
desc: null
|
554 |
+
value: null
|
555 |
+
ddp_bucket_cap_mb:
|
556 |
+
desc: null
|
557 |
+
value: null
|
558 |
+
ddp_broadcast_buffers:
|
559 |
+
desc: null
|
560 |
+
value: null
|
561 |
+
dataloader_pin_memory:
|
562 |
+
desc: null
|
563 |
+
value: true
|
564 |
+
dataloader_persistent_workers:
|
565 |
+
desc: null
|
566 |
+
value: false
|
567 |
+
skip_memory_metrics:
|
568 |
+
desc: null
|
569 |
+
value: true
|
570 |
+
use_legacy_prediction_loop:
|
571 |
+
desc: null
|
572 |
+
value: false
|
573 |
+
push_to_hub:
|
574 |
+
desc: null
|
575 |
+
value: true
|
576 |
+
resume_from_checkpoint:
|
577 |
+
desc: null
|
578 |
+
value: null
|
579 |
+
hub_model_id:
|
580 |
+
desc: null
|
581 |
+
value: null
|
582 |
+
hub_strategy:
|
583 |
+
desc: null
|
584 |
+
value: every_save
|
585 |
+
hub_token:
|
586 |
+
desc: null
|
587 |
+
value: <HUB_TOKEN>
|
588 |
+
hub_private_repo:
|
589 |
+
desc: null
|
590 |
+
value: false
|
591 |
+
hub_always_push:
|
592 |
+
desc: null
|
593 |
+
value: false
|
594 |
+
gradient_checkpointing:
|
595 |
+
desc: null
|
596 |
+
value: true
|
597 |
+
gradient_checkpointing_kwargs:
|
598 |
+
desc: null
|
599 |
+
value:
|
600 |
+
use_reentrant: false
|
601 |
+
include_inputs_for_metrics:
|
602 |
+
desc: null
|
603 |
+
value: false
|
604 |
+
fp16_backend:
|
605 |
+
desc: null
|
606 |
+
value: auto
|
607 |
+
push_to_hub_model_id:
|
608 |
+
desc: null
|
609 |
+
value: null
|
610 |
+
push_to_hub_organization:
|
611 |
+
desc: null
|
612 |
+
value: null
|
613 |
+
push_to_hub_token:
|
614 |
+
desc: null
|
615 |
+
value: <PUSH_TO_HUB_TOKEN>
|
616 |
+
mp_parameters:
|
617 |
+
desc: null
|
618 |
+
value: ''
|
619 |
+
auto_find_batch_size:
|
620 |
+
desc: null
|
621 |
+
value: false
|
622 |
+
full_determinism:
|
623 |
+
desc: null
|
624 |
+
value: false
|
625 |
+
torchdynamo:
|
626 |
+
desc: null
|
627 |
+
value: null
|
628 |
+
ray_scope:
|
629 |
+
desc: null
|
630 |
+
value: last
|
631 |
+
ddp_timeout:
|
632 |
+
desc: null
|
633 |
+
value: 7200
|
634 |
+
torch_compile:
|
635 |
+
desc: null
|
636 |
+
value: false
|
637 |
+
torch_compile_backend:
|
638 |
+
desc: null
|
639 |
+
value: null
|
640 |
+
torch_compile_mode:
|
641 |
+
desc: null
|
642 |
+
value: null
|
643 |
+
dispatch_batches:
|
644 |
+
desc: null
|
645 |
+
value: null
|
646 |
+
split_batches:
|
647 |
+
desc: null
|
648 |
+
value: null
|
649 |
+
include_tokens_per_second:
|
650 |
+
desc: null
|
651 |
+
value: false
|
652 |
+
include_num_input_tokens_seen:
|
653 |
+
desc: null
|
654 |
+
value: false
|
655 |
+
neftune_noise_alpha:
|
656 |
+
desc: null
|
657 |
+
value: null
|
658 |
+
optim_target_modules:
|
659 |
+
desc: null
|
660 |
+
value: null
|
661 |
+
max_seq_length:
|
662 |
+
desc: null
|
663 |
+
value: 2048
|
wandb/run-20240424_164324-xfbnm7qo/files/output.log
ADDED
@@ -0,0 +1,522 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
0%| | 0/20000 [00:00<?, ?it/s]/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.)
|
2 |
+
warnings.warn(
|
3 |
+
0%| | 1/20000 [00:03<16:45:22, 3.02s/it]
|
4 |
+
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
|
13 |
+
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
|
26 |
+
|
27 |
+
0%| | 25/20000 [00:49<10:30:28, 1.89s/it]
|
28 |
+
|
29 |
+
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
+
0%|▏ | 50/20000 [01:36<10:30:25, 1.90s/it]
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
|
63 |
+
|
64 |
+
|
65 |
+
|
66 |
+
|
67 |
+
|
68 |
+
|
69 |
+
|
70 |
+
|
71 |
+
|
72 |
+
|
73 |
+
|
74 |
+
|
75 |
+
|
76 |
+
0%|▎ | 75/20000 [02:23<10:25:09, 1.88s/it]
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
|
83 |
+
|
84 |
+
|
85 |
+
|
86 |
+
|
87 |
+
|
88 |
+
|
89 |
+
|
90 |
+
|
91 |
+
|
92 |
+
|
93 |
+
|
94 |
+
|
95 |
+
|
96 |
+
|
97 |
+
|
98 |
+
|
99 |
+
|
100 |
+
|
101 |
+
0%|▍ | 100/20000 [03:10<10:21:12, 1.87s/it]
|
102 |
+
|
103 |
+
|
104 |
+
|
105 |
+
|
106 |
+
|
107 |
+
|
108 |
+
|
109 |
+
|
110 |
+
|
111 |
+
|
112 |
+
|
113 |
+
|
114 |
+
|
115 |
+
|
116 |
+
|
117 |
+
|
118 |
+
|
119 |
+
|
120 |
+
|
121 |
+
|
122 |
+
|
123 |
+
|
124 |
+
|
125 |
+
1%|▍ | 125/20000 [03:57<10:17:00, 1.86s/it]
|
126 |
+
|
127 |
+
|
128 |
+
|
129 |
+
|
130 |
+
|
131 |
+
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
|
136 |
+
|
137 |
+
|
138 |
+
|
139 |
+
|
140 |
+
|
141 |
+
|
142 |
+
|
143 |
+
|
144 |
+
|
145 |
+
|
146 |
+
|
147 |
+
|
148 |
+
|
149 |
+
1%|▌ | 150/20000 [04:43<10:12:27, 1.85s/it]
|
150 |
+
|
151 |
+
|
152 |
+
|
153 |
+
|
154 |
+
|
155 |
+
|
156 |
+
|
157 |
+
|
158 |
+
|
159 |
+
|
160 |
+
|
161 |
+
|
162 |
+
|
163 |
+
|
164 |
+
|
165 |
+
|
166 |
+
|
167 |
+
|
168 |
+
|
169 |
+
|
170 |
+
|
171 |
+
|
172 |
+
|
173 |
+
1%|▋ | 174/20000 [05:28<10:11:10, 1.85s/it]
|
174 |
+
|
175 |
+
|
176 |
+
|
177 |
+
|
178 |
+
|
179 |
+
|
180 |
+
|
181 |
+
|
182 |
+
|
183 |
+
|
184 |
+
|
185 |
+
|
186 |
+
|
187 |
+
|
188 |
+
|
189 |
+
|
190 |
+
|
191 |
+
|
192 |
+
|
193 |
+
|
194 |
+
|
195 |
+
|
196 |
+
|
197 |
+
1%|▊ | 199/20000 [06:14<10:07:08, 1.84s/it]
|
198 |
+
|
199 |
+
|
200 |
+
|
201 |
+
|
202 |
+
|
203 |
+
|
204 |
+
|
205 |
+
|
206 |
+
|
207 |
+
|
208 |
+
|
209 |
+
|
210 |
+
|
211 |
+
|
212 |
+
|
213 |
+
|
214 |
+
|
215 |
+
|
216 |
+
|
217 |
+
|
218 |
+
|
219 |
+
|
220 |
+
|
221 |
+
|
222 |
+
1%|▊ | 225/20000 [07:02<10:09:09, 1.85s/it]
|
223 |
+
|
224 |
+
|
225 |
+
|
226 |
+
|
227 |
+
|
228 |
+
|
229 |
+
|
230 |
+
|
231 |
+
|
232 |
+
|
233 |
+
|
234 |
+
|
235 |
+
|
236 |
+
|
237 |
+
|
238 |
+
|
239 |
+
|
240 |
+
|
241 |
+
|
242 |
+
|
243 |
+
|
244 |
+
|
245 |
+
|
246 |
+
1%|▉ | 250/20000 [07:48<10:05:59, 1.84s/it]
|
247 |
+
|
248 |
+
|
249 |
+
|
250 |
+
|
251 |
+
|
252 |
+
|
253 |
+
|
254 |
+
|
255 |
+
|
256 |
+
|
257 |
+
|
258 |
+
|
259 |
+
|
260 |
+
|
261 |
+
|
262 |
+
|
263 |
+
|
264 |
+
|
265 |
+
|
266 |
+
|
267 |
+
|
268 |
+
|
269 |
+
1%|█ | 274/20000 [08:32<10:02:25, 1.83s/it]
|
270 |
+
|
271 |
+
|
272 |
+
|
273 |
+
|
274 |
+
|
275 |
+
|
276 |
+
|
277 |
+
|
278 |
+
|
279 |
+
|
280 |
+
|
281 |
+
|
282 |
+
|
283 |
+
|
284 |
+
|
285 |
+
|
286 |
+
|
287 |
+
|
288 |
+
|
289 |
+
|
290 |
+
|
291 |
+
|
292 |
+
|
293 |
+
1%|█▏ | 299/20000 [09:18<10:01:03, 1.83s/it]
|
294 |
+
|
295 |
+
|
296 |
+
|
297 |
+
|
298 |
+
|
299 |
+
|
300 |
+
|
301 |
+
|
302 |
+
|
303 |
+
|
304 |
+
|
305 |
+
|
306 |
+
|
307 |
+
|
308 |
+
|
309 |
+
|
310 |
+
|
311 |
+
|
312 |
+
|
313 |
+
|
314 |
+
|
315 |
+
|
316 |
+
|
317 |
+
2%|█▏ | 324/20000 [10:04<10:02:02, 1.84s/it]
|
318 |
+
|
319 |
+
|
320 |
+
|
321 |
+
|
322 |
+
|
323 |
+
|
324 |
+
|
325 |
+
|
326 |
+
|
327 |
+
|
328 |
+
|
329 |
+
|
330 |
+
|
331 |
+
|
332 |
+
|
333 |
+
|
334 |
+
|
335 |
+
|
336 |
+
|
337 |
+
|
338 |
+
|
339 |
+
|
340 |
+
|
341 |
+
2%|█▎ | 350/20000 [10:52<9:54:57, 1.82s/it]
|
342 |
+
|
343 |
+
|
344 |
+
|
345 |
+
|
346 |
+
|
347 |
+
|
348 |
+
|
349 |
+
|
350 |
+
|
351 |
+
|
352 |
+
|
353 |
+
|
354 |
+
|
355 |
+
|
356 |
+
|
357 |
+
|
358 |
+
|
359 |
+
|
360 |
+
|
361 |
+
|
362 |
+
|
363 |
+
|
364 |
+
|
365 |
+
2%|█▍ | 375/20000 [11:37<10:00:16, 1.84s/it]
|
366 |
+
|
367 |
+
|
368 |
+
|
369 |
+
|
370 |
+
|
371 |
+
|
372 |
+
|
373 |
+
|
374 |
+
|
375 |
+
|
376 |
+
|
377 |
+
|
378 |
+
|
379 |
+
|
380 |
+
|
381 |
+
|
382 |
+
|
383 |
+
|
384 |
+
|
385 |
+
|
386 |
+
|
387 |
+
|
388 |
+
|
389 |
+
2%|█▌ | 400/20000 [12:23<9:56:59, 1.83s/it]
|
390 |
+
|
391 |
+
|
392 |
+
|
393 |
+
|
394 |
+
|
395 |
+
|
396 |
+
|
397 |
+
|
398 |
+
|
399 |
+
|
400 |
+
|
401 |
+
|
402 |
+
|
403 |
+
|
404 |
+
|
405 |
+
|
406 |
+
|
407 |
+
|
408 |
+
|
409 |
+
|
410 |
+
|
411 |
+
|
412 |
+
|
413 |
+
2%|█▋ | 425/20000 [13:09<9:54:49, 1.82s/it]
|
414 |
+
|
415 |
+
|
416 |
+
|
417 |
+
|
418 |
+
|
419 |
+
|
420 |
+
|
421 |
+
|
422 |
+
|
423 |
+
|
424 |
+
|
425 |
+
|
426 |
+
|
427 |
+
|
428 |
+
|
429 |
+
|
430 |
+
|
431 |
+
|
432 |
+
|
433 |
+
|
434 |
+
|
435 |
+
|
436 |
+
|
437 |
+
2%|█▋ | 450/20000 [13:54<9:56:31, 1.83s/it]
|
438 |
+
|
439 |
+
|
440 |
+
|
441 |
+
|
442 |
+
|
443 |
+
|
444 |
+
|
445 |
+
|
446 |
+
|
447 |
+
|
448 |
+
|
449 |
+
|
450 |
+
|
451 |
+
|
452 |
+
|
453 |
+
|
454 |
+
|
455 |
+
|
456 |
+
|
457 |
+
|
458 |
+
|
459 |
+
|
460 |
+
2%|█▊ | 474/20000 [14:38<9:55:46, 1.83s/it]
|
461 |
+
|
462 |
+
|
463 |
+
|
464 |
+
|
465 |
+
|
466 |
+
|
467 |
+
|
468 |
+
|
469 |
+
|
470 |
+
|
471 |
+
|
472 |
+
|
473 |
+
|
474 |
+
|
475 |
+
|
476 |
+
|
477 |
+
|
478 |
+
|
479 |
+
|
480 |
+
|
481 |
+
|
482 |
+
|
483 |
+
2%|█▉ | 500/20000 [15:26<9:52:31, 1.82s/it][INFO|trainer.py:3304] 2024-04-24 16:58:56,780 >> Saving model checkpoint to ./checkpoint-500
|
484 |
+
[INFO|configuration_utils.py:471] 2024-04-24 16:58:56,784 >> Configuration saved in ./checkpoint-500/config.json
|
485 |
+
[INFO|configuration_utils.py:697] 2024-04-24 16:58:56,788 >> Configuration saved in ./checkpoint-500/generation_config.json
|
486 |
+
{'loss': 2.0773, 'grad_norm': 4.6875, 'learning_rate': 0.0001, 'epoch': 0.12}
|
487 |
+
[INFO|modeling_utils.py:2590] 2024-04-24 16:59:01,066 >> Model weights saved in ./checkpoint-500/model.safetensors
|
488 |
+
[INFO|tokenization_utils_base.py:2488] 2024-04-24 16:59:01,079 >> tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
|
489 |
+
[INFO|tokenization_utils_base.py:2497] 2024-04-24 16:59:01,081 >> Special tokens file saved in ./checkpoint-500/special_tokens_map.json
|
490 |
+
[INFO|tokenization_utils_base.py:2488] 2024-04-24 16:59:11,382 >> tokenizer config file saved in ./tokenizer_config.json
|
491 |
+
[INFO|tokenization_utils_base.py:2497] 2024-04-24 16:59:11,384 >> Special tokens file saved in ./special_tokens_map.json
|
492 |
+
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.)
|
493 |
+
warnings.warn(
|
494 |
+
|
495 |
+
|
496 |
+
|
497 |
+
|
498 |
+
|
499 |
+
|
500 |
+
|
501 |
+
|
502 |
+
|
503 |
+
|
504 |
+
|
505 |
+
|
506 |
+
|
507 |
+
|
508 |
+
|
509 |
+
|
510 |
+
|
511 |
+
|
512 |
+
|
513 |
+
|
514 |
+
|
515 |
+
3%|██ | 524/20000 [16:24<9:52:57, 1.83s/it]
|
516 |
+
|
517 |
+
|
518 |
+
|
519 |
+
|
520 |
+
|
521 |
+
|
522 |
+
|
wandb/run-20240424_164324-xfbnm7qo/files/requirements.txt
ADDED
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.0.0
|
2 |
+
accelerate==0.29.3
|
3 |
+
aiohttp==3.9.1
|
4 |
+
aiosignal==1.3.1
|
5 |
+
annotated-types==0.6.0
|
6 |
+
anyio==4.2.0
|
7 |
+
appdirs==1.4.4
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
argon2-cffi==23.1.0
|
10 |
+
arrow==1.3.0
|
11 |
+
asttokens==2.4.1
|
12 |
+
astunparse==1.6.3
|
13 |
+
async-lru==2.0.4
|
14 |
+
attrs==23.1.0
|
15 |
+
audioread==3.0.1
|
16 |
+
babel==2.14.0
|
17 |
+
beautifulsoup4==4.12.3
|
18 |
+
bitsandbytes==0.43.1
|
19 |
+
bleach==6.1.0
|
20 |
+
brotli==1.0.9
|
21 |
+
cachetools==5.3.2
|
22 |
+
certifi==2023.11.17
|
23 |
+
cffi==1.16.0
|
24 |
+
chardet==5.2.0
|
25 |
+
charset-normalizer==2.0.4
|
26 |
+
click==8.1.7
|
27 |
+
comm==0.2.1
|
28 |
+
cryptography==41.0.7
|
29 |
+
datasets==2.18.1.dev0
|
30 |
+
debugpy==1.8.1
|
31 |
+
decorator==5.1.1
|
32 |
+
deepspeed==0.12.2
|
33 |
+
defusedxml==0.7.1
|
34 |
+
dill==0.3.7
|
35 |
+
docker-pycreds==0.4.0
|
36 |
+
docstring-parser==0.15
|
37 |
+
einops==0.7.0
|
38 |
+
evaluate==0.4.0
|
39 |
+
executing==2.0.1
|
40 |
+
fastjsonschema==2.19.1
|
41 |
+
filelock==3.13.1
|
42 |
+
flatbuffers==23.5.26
|
43 |
+
fqdn==1.5.1
|
44 |
+
frozenlist==1.4.1
|
45 |
+
fsspec==2023.10.0
|
46 |
+
gast==0.5.4
|
47 |
+
gitdb==4.0.11
|
48 |
+
gitpython==3.1.40
|
49 |
+
gmpy2==2.1.2
|
50 |
+
google-auth-oauthlib==1.2.0
|
51 |
+
google-auth==2.26.1
|
52 |
+
google-pasta==0.2.0
|
53 |
+
grpcio==1.60.0
|
54 |
+
h11==0.14.0
|
55 |
+
h5py==3.10.0
|
56 |
+
hf-transfer==0.1.5
|
57 |
+
hjson==3.1.0
|
58 |
+
httpcore==1.0.2
|
59 |
+
httpx==0.26.0
|
60 |
+
huggingface-hub==0.22.2
|
61 |
+
idna==3.4
|
62 |
+
ipdb==0.13.13
|
63 |
+
ipykernel==6.29.2
|
64 |
+
ipython==8.21.0
|
65 |
+
isoduration==20.11.0
|
66 |
+
jedi==0.19.1
|
67 |
+
jinja2==3.1.2
|
68 |
+
jiwer==3.0.3
|
69 |
+
joblib==1.3.2
|
70 |
+
json5==0.9.14
|
71 |
+
jsonpointer==2.4
|
72 |
+
jsonschema-specifications==2023.12.1
|
73 |
+
jsonschema==4.21.1
|
74 |
+
jupyter-client==8.6.0
|
75 |
+
jupyter-core==5.7.1
|
76 |
+
jupyter-events==0.9.0
|
77 |
+
jupyter-lsp==2.2.2
|
78 |
+
jupyter-server-terminals==0.5.2
|
79 |
+
jupyter-server==2.12.5
|
80 |
+
jupyterlab-pygments==0.3.0
|
81 |
+
jupyterlab-server==2.25.2
|
82 |
+
jupyterlab==4.1.1
|
83 |
+
keras==2.15.0
|
84 |
+
lazy-loader==0.3
|
85 |
+
libclang==16.0.6
|
86 |
+
librosa==0.10.1
|
87 |
+
llvmlite==0.41.1
|
88 |
+
markdown-it-py==3.0.0
|
89 |
+
markdown==3.5.1
|
90 |
+
markupsafe==2.1.1
|
91 |
+
matplotlib-inline==0.1.6
|
92 |
+
mdurl==0.1.2
|
93 |
+
mistune==3.0.2
|
94 |
+
mkl-fft==1.3.8
|
95 |
+
mkl-random==1.2.4
|
96 |
+
mkl-service==2.4.0
|
97 |
+
ml-dtypes==0.2.0
|
98 |
+
mpmath==1.3.0
|
99 |
+
msgpack==1.0.7
|
100 |
+
multidict==6.0.4
|
101 |
+
multiprocess==0.70.15
|
102 |
+
nbclient==0.9.0
|
103 |
+
nbconvert==7.16.0
|
104 |
+
nbformat==5.9.2
|
105 |
+
nest-asyncio==1.6.0
|
106 |
+
networkx==3.1
|
107 |
+
ninja==1.11.1.1
|
108 |
+
nltk==3.8.1
|
109 |
+
notebook-shim==0.2.3
|
110 |
+
numba==0.58.1
|
111 |
+
numpy==1.26.2
|
112 |
+
nvidia-cublas-cu12==12.1.3.1
|
113 |
+
nvidia-cuda-cupti-cu12==12.1.105
|
114 |
+
nvidia-cuda-nvrtc-cu12==12.1.105
|
115 |
+
nvidia-cuda-runtime-cu12==12.1.105
|
116 |
+
nvidia-cudnn-cu12==8.9.2.26
|
117 |
+
nvidia-cufft-cu12==11.0.2.54
|
118 |
+
nvidia-curand-cu12==10.3.2.106
|
119 |
+
nvidia-cusolver-cu12==11.4.5.107
|
120 |
+
nvidia-cusparse-cu12==12.1.0.106
|
121 |
+
nvidia-nccl-cu12==2.20.5
|
122 |
+
nvidia-nvjitlink-cu12==12.3.101
|
123 |
+
nvidia-nvtx-cu12==12.1.105
|
124 |
+
oauthlib==3.2.2
|
125 |
+
opt-einsum==3.3.0
|
126 |
+
overrides==7.7.0
|
127 |
+
packaging==23.2
|
128 |
+
pandas==2.1.4
|
129 |
+
pandocfilters==1.5.1
|
130 |
+
parso==0.8.3
|
131 |
+
peft==0.7.1
|
132 |
+
pexpect==4.9.0
|
133 |
+
pillow==10.2.0
|
134 |
+
pip==24.0
|
135 |
+
platformdirs==4.1.0
|
136 |
+
pooch==1.8.0
|
137 |
+
prometheus-client==0.19.0
|
138 |
+
prompt-toolkit==3.0.43
|
139 |
+
protobuf==3.20.2
|
140 |
+
psutil==5.9.7
|
141 |
+
ptyprocess==0.7.0
|
142 |
+
pure-eval==0.2.2
|
143 |
+
py-cpuinfo==9.0.0
|
144 |
+
pyarrow-hotfix==0.6
|
145 |
+
pyarrow==14.0.2
|
146 |
+
pyasn1-modules==0.3.0
|
147 |
+
pyasn1==0.5.1
|
148 |
+
pycparser==2.21
|
149 |
+
pydantic-core==2.16.1
|
150 |
+
pydantic==2.6.0
|
151 |
+
pygments==2.17.2
|
152 |
+
pynvml==11.5.0
|
153 |
+
pyopenssl==23.2.0
|
154 |
+
pysocks==1.7.1
|
155 |
+
python-dateutil==2.8.2
|
156 |
+
python-json-logger==2.0.7
|
157 |
+
pytorch-triton==3.0.0+989adb9a29
|
158 |
+
pytz==2023.3.post1
|
159 |
+
pyyaml==6.0.1
|
160 |
+
pyzmq==25.1.2
|
161 |
+
rapidfuzz==3.6.1
|
162 |
+
referencing==0.33.0
|
163 |
+
regex==2023.12.25
|
164 |
+
requests-oauthlib==1.3.1
|
165 |
+
requests==2.31.0
|
166 |
+
responses==0.18.0
|
167 |
+
rfc3339-validator==0.1.4
|
168 |
+
rfc3986-validator==0.1.1
|
169 |
+
rich==13.7.0
|
170 |
+
rpds-py==0.17.1
|
171 |
+
rsa==4.9
|
172 |
+
safetensors==0.4.1
|
173 |
+
scikit-learn==1.3.2
|
174 |
+
scipy==1.11.4
|
175 |
+
send2trash==1.8.2
|
176 |
+
sentencepiece==0.1.99
|
177 |
+
sentry-sdk==1.39.1
|
178 |
+
setproctitle==1.3.3
|
179 |
+
setuptools==68.2.2
|
180 |
+
shtab==1.6.5
|
181 |
+
six==1.16.0
|
182 |
+
smmap==5.0.1
|
183 |
+
sniffio==1.3.0
|
184 |
+
soundfile==0.12.1
|
185 |
+
soupsieve==2.5
|
186 |
+
soxr==0.3.7
|
187 |
+
stack-data==0.6.3
|
188 |
+
sympy==1.12
|
189 |
+
tensorboard-data-server==0.7.2
|
190 |
+
tensorboard==2.15.1
|
191 |
+
tensorflow-cpu==2.15.0.post1
|
192 |
+
tensorflow-estimator==2.15.0
|
193 |
+
tensorflow-io-gcs-filesystem==0.35.0
|
194 |
+
termcolor==2.4.0
|
195 |
+
terminado==0.18.0
|
196 |
+
threadpoolctl==3.2.0
|
197 |
+
tinycss2==1.2.1
|
198 |
+
tokenizers==0.15.0
|
199 |
+
torch==2.4.0.dev20240323+cu121
|
200 |
+
torchaudio==2.2.0.dev20240323+cu121
|
201 |
+
torchvision==0.19.0.dev20240323+cu121
|
202 |
+
tornado==6.4
|
203 |
+
tqdm==4.66.1
|
204 |
+
traitlets==5.14.1
|
205 |
+
transformers==4.39.0.dev0
|
206 |
+
triton==2.2.0
|
207 |
+
trl==0.8.6
|
208 |
+
types-python-dateutil==2.8.19.20240106
|
209 |
+
typing-extensions==4.10.0
|
210 |
+
tyro==0.7.0
|
211 |
+
tzdata==2023.3
|
212 |
+
uri-template==1.3.0
|
213 |
+
urllib3==1.26.18
|
214 |
+
wandb==0.16.1
|
215 |
+
wcwidth==0.2.13
|
216 |
+
webcolors==1.13
|
217 |
+
webencodings==0.5.1
|
218 |
+
websocket-client==1.7.0
|
219 |
+
werkzeug==3.0.1
|
220 |
+
wheel==0.41.2
|
221 |
+
wrapt==1.14.1
|
222 |
+
xxhash==3.4.1
|
223 |
+
yarl==1.9.4
|
wandb/run-20240424_164324-xfbnm7qo/files/wandb-metadata.json
ADDED
@@ -0,0 +1,558 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-1048-aws-x86_64-with-glibc2.31",
|
3 |
+
"python": "3.11.5",
|
4 |
+
"heartbeatAt": "2024-04-24T16:43:25.058035",
|
5 |
+
"startedAt": "2024-04-24T16:43:24.523748",
|
6 |
+
"docker": null,
|
7 |
+
"cuda": null,
|
8 |
+
"args": [
|
9 |
+
"config_full.yaml"
|
10 |
+
],
|
11 |
+
"state": "running",
|
12 |
+
"program": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py",
|
13 |
+
"codePathLocal": "run_sft.py",
|
14 |
+
"codePath": "run_sft.py",
|
15 |
+
"git": {
|
16 |
+
"remote": "https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat",
|
17 |
+
"commit": "cbea69c6b95c970317a1e47c3f614b55b33f8ed9"
|
18 |
+
},
|
19 |
+
"email": null,
|
20 |
+
"root": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat",
|
21 |
+
"host": "ip-26-0-162-233",
|
22 |
+
"username": "sanchit",
|
23 |
+
"executable": "/fsx/sanchit/miniconda3/envs/venv/bin/python",
|
24 |
+
"cpu_count": 96,
|
25 |
+
"cpu_count_logical": 96,
|
26 |
+
"cpu_freq": {
|
27 |
+
"current": 2729.8387291666663,
|
28 |
+
"min": 0.0,
|
29 |
+
"max": 0.0
|
30 |
+
},
|
31 |
+
"cpu_freq_per_core": [
|
32 |
+
{
|
33 |
+
"current": 2650.0,
|
34 |
+
"min": 0.0,
|
35 |
+
"max": 0.0
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"current": 2650.0,
|
39 |
+
"min": 0.0,
|
40 |
+
"max": 0.0
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"current": 3598.161,
|
44 |
+
"min": 0.0,
|
45 |
+
"max": 0.0
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"current": 2650.0,
|
49 |
+
"min": 0.0,
|
50 |
+
"max": 0.0
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"current": 3584.12,
|
54 |
+
"min": 0.0,
|
55 |
+
"max": 0.0
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"current": 2650.0,
|
59 |
+
"min": 0.0,
|
60 |
+
"max": 0.0
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"current": 2650.0,
|
64 |
+
"min": 0.0,
|
65 |
+
"max": 0.0
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"current": 2650.0,
|
69 |
+
"min": 0.0,
|
70 |
+
"max": 0.0
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"current": 2650.0,
|
74 |
+
"min": 0.0,
|
75 |
+
"max": 0.0
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"current": 2650.0,
|
79 |
+
"min": 0.0,
|
80 |
+
"max": 0.0
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"current": 2650.0,
|
84 |
+
"min": 0.0,
|
85 |
+
"max": 0.0
|
86 |
+
},
|
87 |
+
{
|
88 |
+
"current": 3598.175,
|
89 |
+
"min": 0.0,
|
90 |
+
"max": 0.0
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"current": 2650.0,
|
94 |
+
"min": 0.0,
|
95 |
+
"max": 0.0
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"current": 2650.0,
|
99 |
+
"min": 0.0,
|
100 |
+
"max": 0.0
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"current": 2650.0,
|
104 |
+
"min": 0.0,
|
105 |
+
"max": 0.0
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"current": 2650.0,
|
109 |
+
"min": 0.0,
|
110 |
+
"max": 0.0
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"current": 2650.0,
|
114 |
+
"min": 0.0,
|
115 |
+
"max": 0.0
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"current": 3598.329,
|
119 |
+
"min": 0.0,
|
120 |
+
"max": 0.0
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"current": 2650.0,
|
124 |
+
"min": 0.0,
|
125 |
+
"max": 0.0
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"current": 2650.0,
|
129 |
+
"min": 0.0,
|
130 |
+
"max": 0.0
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"current": 2650.0,
|
134 |
+
"min": 0.0,
|
135 |
+
"max": 0.0
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"current": 2650.0,
|
139 |
+
"min": 0.0,
|
140 |
+
"max": 0.0
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"current": 2650.0,
|
144 |
+
"min": 0.0,
|
145 |
+
"max": 0.0
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"current": 2650.0,
|
149 |
+
"min": 0.0,
|
150 |
+
"max": 0.0
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"current": 3596.81,
|
154 |
+
"min": 0.0,
|
155 |
+
"max": 0.0
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"current": 2650.0,
|
159 |
+
"min": 0.0,
|
160 |
+
"max": 0.0
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"current": 2650.0,
|
164 |
+
"min": 0.0,
|
165 |
+
"max": 0.0
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"current": 3598.102,
|
169 |
+
"min": 0.0,
|
170 |
+
"max": 0.0
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"current": 2650.0,
|
174 |
+
"min": 0.0,
|
175 |
+
"max": 0.0
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"current": 2650.0,
|
179 |
+
"min": 0.0,
|
180 |
+
"max": 0.0
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"current": 2650.0,
|
184 |
+
"min": 0.0,
|
185 |
+
"max": 0.0
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"current": 2650.0,
|
189 |
+
"min": 0.0,
|
190 |
+
"max": 0.0
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"current": 2650.0,
|
194 |
+
"min": 0.0,
|
195 |
+
"max": 0.0
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"current": 2650.0,
|
199 |
+
"min": 0.0,
|
200 |
+
"max": 0.0
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"current": 2650.0,
|
204 |
+
"min": 0.0,
|
205 |
+
"max": 0.0
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"current": 2650.0,
|
209 |
+
"min": 0.0,
|
210 |
+
"max": 0.0
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"current": 3596.611,
|
214 |
+
"min": 0.0,
|
215 |
+
"max": 0.0
|
216 |
+
},
|
217 |
+
{
|
218 |
+
"current": 2650.0,
|
219 |
+
"min": 0.0,
|
220 |
+
"max": 0.0
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"current": 2650.0,
|
224 |
+
"min": 0.0,
|
225 |
+
"max": 0.0
|
226 |
+
},
|
227 |
+
{
|
228 |
+
"current": 2650.0,
|
229 |
+
"min": 0.0,
|
230 |
+
"max": 0.0
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"current": 3598.198,
|
234 |
+
"min": 0.0,
|
235 |
+
"max": 0.0
|
236 |
+
},
|
237 |
+
{
|
238 |
+
"current": 2650.0,
|
239 |
+
"min": 0.0,
|
240 |
+
"max": 0.0
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"current": 2650.0,
|
244 |
+
"min": 0.0,
|
245 |
+
"max": 0.0
|
246 |
+
},
|
247 |
+
{
|
248 |
+
"current": 2650.0,
|
249 |
+
"min": 0.0,
|
250 |
+
"max": 0.0
|
251 |
+
},
|
252 |
+
{
|
253 |
+
"current": 2650.0,
|
254 |
+
"min": 0.0,
|
255 |
+
"max": 0.0
|
256 |
+
},
|
257 |
+
{
|
258 |
+
"current": 2650.0,
|
259 |
+
"min": 0.0,
|
260 |
+
"max": 0.0
|
261 |
+
},
|
262 |
+
{
|
263 |
+
"current": 2650.0,
|
264 |
+
"min": 0.0,
|
265 |
+
"max": 0.0
|
266 |
+
},
|
267 |
+
{
|
268 |
+
"current": 2650.0,
|
269 |
+
"min": 0.0,
|
270 |
+
"max": 0.0
|
271 |
+
},
|
272 |
+
{
|
273 |
+
"current": 2650.0,
|
274 |
+
"min": 0.0,
|
275 |
+
"max": 0.0
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"current": 2650.0,
|
279 |
+
"min": 0.0,
|
280 |
+
"max": 0.0
|
281 |
+
},
|
282 |
+
{
|
283 |
+
"current": 2650.0,
|
284 |
+
"min": 0.0,
|
285 |
+
"max": 0.0
|
286 |
+
},
|
287 |
+
{
|
288 |
+
"current": 2650.0,
|
289 |
+
"min": 0.0,
|
290 |
+
"max": 0.0
|
291 |
+
},
|
292 |
+
{
|
293 |
+
"current": 2650.0,
|
294 |
+
"min": 0.0,
|
295 |
+
"max": 0.0
|
296 |
+
},
|
297 |
+
{
|
298 |
+
"current": 2650.0,
|
299 |
+
"min": 0.0,
|
300 |
+
"max": 0.0
|
301 |
+
},
|
302 |
+
{
|
303 |
+
"current": 2650.0,
|
304 |
+
"min": 0.0,
|
305 |
+
"max": 0.0
|
306 |
+
},
|
307 |
+
{
|
308 |
+
"current": 2650.0,
|
309 |
+
"min": 0.0,
|
310 |
+
"max": 0.0
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"current": 2650.0,
|
314 |
+
"min": 0.0,
|
315 |
+
"max": 0.0
|
316 |
+
},
|
317 |
+
{
|
318 |
+
"current": 2650.0,
|
319 |
+
"min": 0.0,
|
320 |
+
"max": 0.0
|
321 |
+
},
|
322 |
+
{
|
323 |
+
"current": 2650.0,
|
324 |
+
"min": 0.0,
|
325 |
+
"max": 0.0
|
326 |
+
},
|
327 |
+
{
|
328 |
+
"current": 2650.0,
|
329 |
+
"min": 0.0,
|
330 |
+
"max": 0.0
|
331 |
+
},
|
332 |
+
{
|
333 |
+
"current": 2650.0,
|
334 |
+
"min": 0.0,
|
335 |
+
"max": 0.0
|
336 |
+
},
|
337 |
+
{
|
338 |
+
"current": 2650.0,
|
339 |
+
"min": 0.0,
|
340 |
+
"max": 0.0
|
341 |
+
},
|
342 |
+
{
|
343 |
+
"current": 2650.0,
|
344 |
+
"min": 0.0,
|
345 |
+
"max": 0.0
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"current": 2650.0,
|
349 |
+
"min": 0.0,
|
350 |
+
"max": 0.0
|
351 |
+
},
|
352 |
+
{
|
353 |
+
"current": 2650.0,
|
354 |
+
"min": 0.0,
|
355 |
+
"max": 0.0
|
356 |
+
},
|
357 |
+
{
|
358 |
+
"current": 2650.0,
|
359 |
+
"min": 0.0,
|
360 |
+
"max": 0.0
|
361 |
+
},
|
362 |
+
{
|
363 |
+
"current": 2650.0,
|
364 |
+
"min": 0.0,
|
365 |
+
"max": 0.0
|
366 |
+
},
|
367 |
+
{
|
368 |
+
"current": 2650.0,
|
369 |
+
"min": 0.0,
|
370 |
+
"max": 0.0
|
371 |
+
},
|
372 |
+
{
|
373 |
+
"current": 2650.0,
|
374 |
+
"min": 0.0,
|
375 |
+
"max": 0.0
|
376 |
+
},
|
377 |
+
{
|
378 |
+
"current": 2650.0,
|
379 |
+
"min": 0.0,
|
380 |
+
"max": 0.0
|
381 |
+
},
|
382 |
+
{
|
383 |
+
"current": 2650.0,
|
384 |
+
"min": 0.0,
|
385 |
+
"max": 0.0
|
386 |
+
},
|
387 |
+
{
|
388 |
+
"current": 2650.0,
|
389 |
+
"min": 0.0,
|
390 |
+
"max": 0.0
|
391 |
+
},
|
392 |
+
{
|
393 |
+
"current": 2650.0,
|
394 |
+
"min": 0.0,
|
395 |
+
"max": 0.0
|
396 |
+
},
|
397 |
+
{
|
398 |
+
"current": 2650.0,
|
399 |
+
"min": 0.0,
|
400 |
+
"max": 0.0
|
401 |
+
},
|
402 |
+
{
|
403 |
+
"current": 2650.0,
|
404 |
+
"min": 0.0,
|
405 |
+
"max": 0.0
|
406 |
+
},
|
407 |
+
{
|
408 |
+
"current": 2650.0,
|
409 |
+
"min": 0.0,
|
410 |
+
"max": 0.0
|
411 |
+
},
|
412 |
+
{
|
413 |
+
"current": 2650.0,
|
414 |
+
"min": 0.0,
|
415 |
+
"max": 0.0
|
416 |
+
},
|
417 |
+
{
|
418 |
+
"current": 2650.0,
|
419 |
+
"min": 0.0,
|
420 |
+
"max": 0.0
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"current": 2650.0,
|
424 |
+
"min": 0.0,
|
425 |
+
"max": 0.0
|
426 |
+
},
|
427 |
+
{
|
428 |
+
"current": 2650.0,
|
429 |
+
"min": 0.0,
|
430 |
+
"max": 0.0
|
431 |
+
},
|
432 |
+
{
|
433 |
+
"current": 2650.0,
|
434 |
+
"min": 0.0,
|
435 |
+
"max": 0.0
|
436 |
+
},
|
437 |
+
{
|
438 |
+
"current": 2650.0,
|
439 |
+
"min": 0.0,
|
440 |
+
"max": 0.0
|
441 |
+
},
|
442 |
+
{
|
443 |
+
"current": 2650.0,
|
444 |
+
"min": 0.0,
|
445 |
+
"max": 0.0
|
446 |
+
},
|
447 |
+
{
|
448 |
+
"current": 2650.0,
|
449 |
+
"min": 0.0,
|
450 |
+
"max": 0.0
|
451 |
+
},
|
452 |
+
{
|
453 |
+
"current": 2650.0,
|
454 |
+
"min": 0.0,
|
455 |
+
"max": 0.0
|
456 |
+
},
|
457 |
+
{
|
458 |
+
"current": 2650.0,
|
459 |
+
"min": 0.0,
|
460 |
+
"max": 0.0
|
461 |
+
},
|
462 |
+
{
|
463 |
+
"current": 2650.0,
|
464 |
+
"min": 0.0,
|
465 |
+
"max": 0.0
|
466 |
+
},
|
467 |
+
{
|
468 |
+
"current": 2650.0,
|
469 |
+
"min": 0.0,
|
470 |
+
"max": 0.0
|
471 |
+
},
|
472 |
+
{
|
473 |
+
"current": 2650.0,
|
474 |
+
"min": 0.0,
|
475 |
+
"max": 0.0
|
476 |
+
},
|
477 |
+
{
|
478 |
+
"current": 2650.0,
|
479 |
+
"min": 0.0,
|
480 |
+
"max": 0.0
|
481 |
+
},
|
482 |
+
{
|
483 |
+
"current": 2650.0,
|
484 |
+
"min": 0.0,
|
485 |
+
"max": 0.0
|
486 |
+
},
|
487 |
+
{
|
488 |
+
"current": 2650.0,
|
489 |
+
"min": 0.0,
|
490 |
+
"max": 0.0
|
491 |
+
},
|
492 |
+
{
|
493 |
+
"current": 2650.0,
|
494 |
+
"min": 0.0,
|
495 |
+
"max": 0.0
|
496 |
+
},
|
497 |
+
{
|
498 |
+
"current": 2650.0,
|
499 |
+
"min": 0.0,
|
500 |
+
"max": 0.0
|
501 |
+
},
|
502 |
+
{
|
503 |
+
"current": 2650.0,
|
504 |
+
"min": 0.0,
|
505 |
+
"max": 0.0
|
506 |
+
},
|
507 |
+
{
|
508 |
+
"current": 2650.0,
|
509 |
+
"min": 0.0,
|
510 |
+
"max": 0.0
|
511 |
+
}
|
512 |
+
],
|
513 |
+
"disk": {
|
514 |
+
"/": {
|
515 |
+
"total": 290.7472343444824,
|
516 |
+
"used": 59.25613021850586
|
517 |
+
}
|
518 |
+
},
|
519 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
520 |
+
"gpu_count": 8,
|
521 |
+
"gpu_devices": [
|
522 |
+
{
|
523 |
+
"name": "NVIDIA H100 80GB HBM3",
|
524 |
+
"memory_total": 85520809984
|
525 |
+
},
|
526 |
+
{
|
527 |
+
"name": "NVIDIA H100 80GB HBM3",
|
528 |
+
"memory_total": 85520809984
|
529 |
+
},
|
530 |
+
{
|
531 |
+
"name": "NVIDIA H100 80GB HBM3",
|
532 |
+
"memory_total": 85520809984
|
533 |
+
},
|
534 |
+
{
|
535 |
+
"name": "NVIDIA H100 80GB HBM3",
|
536 |
+
"memory_total": 85520809984
|
537 |
+
},
|
538 |
+
{
|
539 |
+
"name": "NVIDIA H100 80GB HBM3",
|
540 |
+
"memory_total": 85520809984
|
541 |
+
},
|
542 |
+
{
|
543 |
+
"name": "NVIDIA H100 80GB HBM3",
|
544 |
+
"memory_total": 85520809984
|
545 |
+
},
|
546 |
+
{
|
547 |
+
"name": "NVIDIA H100 80GB HBM3",
|
548 |
+
"memory_total": 85520809984
|
549 |
+
},
|
550 |
+
{
|
551 |
+
"name": "NVIDIA H100 80GB HBM3",
|
552 |
+
"memory_total": 85520809984
|
553 |
+
}
|
554 |
+
],
|
555 |
+
"memory": {
|
556 |
+
"total": 1999.9855270385742
|
557 |
+
}
|
558 |
+
}
|
wandb/run-20240424_164324-xfbnm7qo/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"train/loss": 2.0215, "train/grad_norm": 4.125, "train/learning_rate": 9.987179487179488e-05, "train/epoch": 0.13, "train/global_step": 525, "_timestamp": 1713977997.0387745, "_runtime": 992.4967684745789, "_step": 21}
|
wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
wandb/run-20240424_164324-xfbnm7qo/logs/debug.log
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-04-24 16:43:24,533 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
|
2 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Configure stats pid to 1854033
|
3 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
|
4 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/settings
|
5 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
|
6 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
|
7 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py'}
|
8 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug.log
|
9 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log
|
10 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():564] calling init triggers
|
11 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
|
12 |
+
config: {}
|
13 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():614] starting backend
|
14 |
+
2024-04-24 16:43:24,534 INFO MainThread:1854033 [wandb_init.py:init():618] setting up manager
|
15 |
+
2024-04-24 16:43:24,537 INFO MainThread:1854033 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
16 |
+
2024-04-24 16:43:24,541 INFO MainThread:1854033 [wandb_init.py:init():624] backend started and connected
|
17 |
+
2024-04-24 16:43:24,544 INFO MainThread:1854033 [wandb_init.py:init():716] updated telemetry
|
18 |
+
2024-04-24 16:43:24,569 INFO MainThread:1854033 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
|
19 |
+
2024-04-24 16:43:24,850 INFO MainThread:1854033 [wandb_run.py:_on_init():2254] communicating current version
|
20 |
+
2024-04-24 16:43:24,896 INFO MainThread:1854033 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
|
21 |
+
|
22 |
+
2024-04-24 16:43:24,896 INFO MainThread:1854033 [wandb_init.py:init():800] starting run threads in backend
|
23 |
+
2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_console_start():2233] atexit reg
|
24 |
+
2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2088] redirect: wrap_raw
|
25 |
+
2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2153] Wrapping output streams.
|
26 |
+
2024-04-24 16:43:30,532 INFO MainThread:1854033 [wandb_run.py:_redirect():2178] Redirects installed.
|
27 |
+
2024-04-24 16:43:30,533 INFO MainThread:1854033 [wandb_init.py:init():841] run started, returning control to user process
|
28 |
+
2024-04-24 16:43:30,535 INFO MainThread:1854033 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.40.0.dev0', 'model_type': 'mistral', 'output_dir': './', 'overwrite_output_dir': 
True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 20000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr24_16-42-31_ip-26-0-162-233', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 5000, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 5000, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
|
wandb/run-20240424_164324-xfbnm7qo/run-xfbnm7qo.wandb
ADDED
Binary file (297 kB). View file
|
|