sanchit-gandhi HF staff committed on Apr 24

Commit

4ea2eae

•

1 Parent(s): cbea69c

Training in progress, step 500

Browse files

Files changed (43) hide show

accelerate_config.yaml +18 -0
alignment/__init__.py +12 -0
alignment/__pycache__/__init__.cpython-311.pyc +0 -0
alignment/__pycache__/configs.cpython-311.pyc +0 -0
alignment/__pycache__/data.cpython-311.pyc +0 -0
alignment/__pycache__/model_utils.cpython-311.pyc +0 -0
alignment/configs.py +254 -0
alignment/data.py +190 -0
alignment/model_utils.py +119 -0
alignment/release.py +106 -0
config.json +26 -0
config_full.yaml +45 -0
model.safetensors +3 -0
run_sft.py +218 -0
runs/Apr24_14-23-38_ip-26-0-162-233/events.out.tfevents.1713973415.ip-26-0-162-233.1840687.0 +3 -0
runs/Apr24_16-42-31_ip-26-0-162-233/events.out.tfevents.1713977002.ip-26-0-162-233.1854033.0 +3 -0
slurm_job.slurm +76 -0
special_tokens_map.json +24 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +43 -0
training_args.bin +3 -0
wandb/debug-cli.sanchit.log +0 -0
wandb/debug-internal.log +0 -0
wandb/debug.log +28 -0
wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml +300 -0
wandb/run-20240424_154339-mwp0iutr/files/config.yaml +663 -0
wandb/run-20240424_154339-mwp0iutr/files/output.log +131 -0
wandb/run-20240424_154339-mwp0iutr/files/requirements.txt +223 -0
wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json +558 -0
wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json +1 -0
wandb/run-20240424_154339-mwp0iutr/logs/debug-internal.log +209 -0
wandb/run-20240424_154339-mwp0iutr/logs/debug.log +29 -0
wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb +0 -0
wandb/run-20240424_164324-xfbnm7qo/files/conda-environment.yaml +300 -0
wandb/run-20240424_164324-xfbnm7qo/files/config.yaml +663 -0
wandb/run-20240424_164324-xfbnm7qo/files/output.log +522 -0
wandb/run-20240424_164324-xfbnm7qo/files/requirements.txt +223 -0
wandb/run-20240424_164324-xfbnm7qo/files/wandb-metadata.json +558 -0
wandb/run-20240424_164324-xfbnm7qo/files/wandb-summary.json +1 -0
wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log +0 -0
wandb/run-20240424_164324-xfbnm7qo/logs/debug.log +28 -0
wandb/run-20240424_164324-xfbnm7qo/run-xfbnm7qo.wandb +0 -0

accelerate_config.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

alignment/__init__.py ADDED Viewed

	@@ -0,0 +1,12 @@

+__version__ = "0.3.0.dev0"
+from .configs import DataArguments, DPOConfig, H4ArgumentParser, ModelArguments, SFTConfig
+from .data import apply_chat_template, get_datasets
+from .model_utils import (
+    get_checkpoint,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+    get_tokenizer,
+    is_adapter_model,
+)

alignment/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (752 Bytes). View file

alignment/__pycache__/configs.cpython-311.pyc ADDED Viewed

Binary file (14.1 kB). View file

alignment/__pycache__/data.cpython-311.pyc ADDED Viewed

Binary file (9.06 kB). View file

alignment/__pycache__/model_utils.cpython-311.pyc ADDED Viewed

Binary file (5.05 kB). View file

alignment/configs.py ADDED Viewed

	@@ -0,0 +1,254 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import dataclasses
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, NewType, Optional, Tuple
+import transformers
+from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, HfArgumentParser
+# Keys of the causal-LM auto-mapping (presumably the model config classes -- confirm against transformers).
+MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
+# The `model_type` attribute of each entry collected above.
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+# Alias used in annotations to mean "an instance of one of the parsed dataclasses".
+DataClassType = NewType("DataClassType", Any)
+class H4ArgumentParser(HfArgumentParser):
+    def parse_yaml_and_args(self, yaml_arg: str, other_args: Optional[List[str]] = None) -> List[dataclass]:
+        """
+        Parse a YAML file and overwrite the default/loaded values with the values provided to the command line.
+        Args:
+            yaml_arg (`str`):
+                The path to the config file used
+            other_args (`List[str]`, *optional`):
+                A list of strings to parse as command line arguments, e.g. ['--arg=val', '--arg2=val2'].
+        Returns:
+            [`List[dataclass]`]: a list of dataclasses with the values from the YAML file and the command line
+        """
+        arg_list = self.parse_yaml_file(os.path.abspath(yaml_arg))
+        outputs = []
+        # strip other args list into dict of key-value pairs
+        other_args = {arg.split("=")[0].strip("-"): arg.split("=")[1] for arg in other_args}
+        used_args = {}
+        # overwrite the default/loaded value with the value provided to the command line
+        # adapted from https://github.com/huggingface/transformers/blob/d0b5002378daabf62769159add3e7d66d3f83c3b/src/transformers/hf_argparser.py#L327
+        for data_yaml, data_class in zip(arg_list, self.dataclass_types):
+            keys = {f.name for f in dataclasses.fields(data_yaml) if f.init}
+            inputs = {k: v for k, v in vars(data_yaml).items() if k in keys}
+            for arg, val in other_args.items():
+                # add only if in keys
+                if arg in keys:
+                    base_type = data_yaml.__dataclass_fields__[arg].type
+                    inputs[arg] = val
+                    # cast type for ints, floats (default to strings)
+                    if base_type in [int, float]:
+                        inputs[arg] = base_type(val)
+                    if base_type == List[str]:
+                        inputs[arg] = [str(v) for v in val.split(",")]
+                    # bool of a non-empty string is True, so we manually check for bools
+                    if base_type == bool:
+                        if val in ["true", "True"]:
+                            inputs[arg] = True
+                        else:
+                            inputs[arg] = False
+                    # add to used-args so we can check if double add
+                    if arg not in used_args:
+                        used_args[arg] = val
+                    else:
+                        raise ValueError(f"Duplicate argument provided: {arg}, may cause unexpected behavior")
+            obj = data_class(**inputs)
+            outputs.append(obj)
+        return outputs
+    def parse(self) -> DataClassType | Tuple[DataClassType]:
+        if len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"):
+            # If we pass only one argument to the script and it's the path to a YAML file,
+            # let's parse it to get our arguments.
+            output = self.parse_yaml_file(os.path.abspath(sys.argv[1]))
+        # parse command line args and yaml file
+        elif len(sys.argv) > 2 and sys.argv[1].endswith(".yaml"):
+            output = self.parse_yaml_and_args(os.path.abspath(sys.argv[1]), sys.argv[2:])
+        # parse command line args only
+        else:
+            output = self.parse_args_into_dataclasses()
+        if len(output) == 1:
+            output = output[0]
+        return output
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune.
+    """
+    base_model_revision: Optional[str] = field(
+        default=None,
+        metadata={"help": ("The base model checkpoint for weights initialization with PEFT adatpers.")},
+    )
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
+            )
+        },
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    model_code_revision: str = field(default=None, metadata={"help": "The branch of the IFT model"})
+    torch_dtype: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
+                "dtype will be automatically derived from the model's weights."
+            ),
+            "choices": ["auto", "bfloat16", "float16", "float32"],
+        },
+    )
+    trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code when loading a model."})
+    use_flash_attention_2: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to use flash attention 2. You must install this manually by running `pip install flash-attn --no-build-isolation`"
+            )
+        },
+    )
+    use_peft: bool = field(
+        default=False,
+        metadata={"help": ("Whether to use PEFT or not for training.")},
+    )
+    lora_r: Optional[int] = field(
+        default=16,
+        metadata={"help": ("LoRA R value.")},
+    )
+    lora_alpha: Optional[int] = field(
+        default=32,
+        metadata={"help": ("LoRA alpha.")},
+    )
+    lora_dropout: Optional[float] = field(
+        default=0.05,
+        metadata={"help": ("LoRA dropout.")},
+    )
+    lora_target_modules: Optional[List[str]] = field(
+        default=None,
+        metadata={"help": ("LoRA target modules.")},
+    )
+    lora_modules_to_save: Optional[List[str]] = field(
+        default=None,
+        metadata={"help": ("Model layers to unfreeze & train")},
+    )
+    load_in_8bit: bool = field(default=False, metadata={"help": "use 8 bit precision"})
+    load_in_4bit: bool = field(default=False, metadata={"help": "use 4 bit precision"})
+    bnb_4bit_quant_type: Optional[str] = field(
+        default="nf4", metadata={"help": "precise the quantization type (fp4 or nf4)"}
+    )
+    use_bnb_nested_quant: bool = field(default=False, metadata={"help": "use nested quantization"})
+    def __post_init__(self):
+        if self.load_in_8bit and self.load_in_4bit:
+            raise ValueError("You can't use 8 bit and 4 bit precision at the same time")
+@dataclass
+class DataArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+    chat_template: Optional[str] = field(default=None, metadata={"help": "The chat template to use."})
+    dataset_mixer: Optional[Dict[str, float]] = field(
+        default=None,
+        metadata={"help": ("Datasets and their proportions to be used for training ift/rl.")},
+    )
+    dataset_splits: Optional[List[str]] = field(
+        default_factory=lambda: ["train", "test"],
+        metadata={"help": ("List of train test splits to use in the dataset")},
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    truncation_side: Optional[str] = field(
+        default=None, metadata={"help": "Truncation side to use for the tokenizer."}
+    )
+@dataclass
+class SFTConfig(transformers.TrainingArguments):
+    """
+    Arguments related to the training process itself. For all parameters, see: https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/trainer#transformers.TrainingArguments
+    """
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={"help": ("Used by TRL for reward model training, which tries to read this parameter in init.")},
+    )
+    logging_first_step: bool = field(
+        default=True,
+        metadata={"help": ("Whether to log and evaluate the first global_step or not.")},
+    )
+    optim: Optional[str] = field(default="adamw_torch")
+@dataclass
+class DPOConfig(transformers.TrainingArguments):
+    """
+    Arguments related to the DPO training process itself. For all parameters, see: https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/trainer#transformers.TrainingArguments
+    """
+    beta: Optional[float] = field(
+        default=0.1,
+        metadata={"help": "The beta factor in DPO loss. Higher beta means less divergence from the initial policy."},
+    )
+    hub_model_revision: Optional[str] = field(
+        default="main",
+        metadata={"help": ("The Hub model branch to push the model to.")},
+    )
+    logging_first_step: bool = field(
+        default=True,
+        metadata={"help": ("Whether to log and evaluate the first global_step or not.")},
+    )
+    max_prompt_length: Optional[int] = field(
+        default=None,
+        metadata={"help": ("For DPO, the maximum length of the prompt to use for conditioning the model.")},
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": ("Used by TRL for reward model training, which tries to read this parameter in init.")},
+    )
+    optim: Optional[str] = field(default="rmsprop")
+    remove_unused_columns: bool = field(default=False)
+    loss_type: Optional[str] = field(default="sigmoid", metadata={"help": ("The loss type for DPO.")})

alignment/data.py ADDED Viewed

	@@ -0,0 +1,190 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import List, Literal, Optional
+from datasets import DatasetDict, concatenate_datasets, load_dataset, load_from_disk
+from datasets.builder import DatasetGenerationError
+from .configs import DataArguments
+# Jinja chat template: each user/system/assistant message is rendered as a `<|role|>` tag, a newline, the message
+# content and `eos_token`; `<|assistant|>` is appended after the last message when `add_generation_prompt` is set.
+DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+def maybe_insert_system_message(messages, tokenizer):
+    if messages[0]["role"] == "system":
+        return
+    # chat template can be one of two attributes, we check in order
+    chat_template = tokenizer.chat_template
+    if chat_template is None:
+        chat_template = tokenizer.default_chat_template
+    # confirm the jinja template refers to a system message before inserting
+    if "system" in chat_template:
+        messages.insert(0, {"role": "system", "content": ""})
+def apply_chat_template(
+    example,
+    tokenizer,
+    task: Literal["sft", "generation", "rm", "dpo"],
+):
+    if task in ["sft", "generation"]:
+        messages = example["messages"]
+        # We add an empty system message if there is none
+        maybe_insert_system_message(messages, tokenizer)
+        example["text"] = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True if task == "generation" else False
+        )
+    elif task == "rm":
+        if all(k in example.keys() for k in ("chosen", "rejected")):
+            chosen_messages = example["chosen"]
+            rejected_messages = example["rejected"]
+            # We add an empty system message if there is none
+            maybe_insert_system_message(chosen_messages, tokenizer)
+            maybe_insert_system_message(rejected_messages, tokenizer)
+            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
+            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
+        else:
+            raise ValueError(
+                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
+            )
+    elif task == "dpo":
+        if all(k in example.keys() for k in ("chosen", "rejected")):
+            # For DPO, the inputs are triples of (prompt, chosen, rejected), where `chosen` and `rejected` are the final turn of a dialogue
+            # We therefore need to extract the N-1 turns to form the prompt
+            prompt_messages = example["chosen"][:-1]
+            # Prepend a system message if the first message is not a system message
+            if example["chosen"][0]["role"] != "system":
+                prompt_messages.insert(0, {"role": "system", "content": ""})
+            # Now we extract the final turn to define chosen/rejected responses
+            chosen_messages = example["chosen"][-1:]
+            rejected_messages = example["rejected"][-1:]
+            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
+            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
+            example["text_prompt"] = tokenizer.apply_chat_template(prompt_messages, tokenize=False)
+        else:
+            raise ValueError(
+                f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
+            )
+    else:
+        raise ValueError(
+            f"Task {task} not supported, please ensure that the provided task is one of {['sft', 'generation', 'rm', 'dpo']}"
+        )
+    return example
+def get_datasets(
+    data_config: DataArguments | dict,
+    splits: List[str] = ["train", "test"],
+    shuffle: bool = True,
+) -> DatasetDict:
+    """
+    Loads one or more datasets with varying training set proportions.
+    Args:
+        data_config (`DataArguments` or `dict`):
+            Dataset configuration and split proportions.
+        splits (`List[str]`, *optional*, defaults to `['train', 'test']`):
+            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
+        shuffle (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the training and testing/validation data.
+    Returns
+        [`DatasetDict`]: The dataset dictionary containing the loaded datasets.
+    """
+    if type(data_config) is DataArguments:
+        # Structure of the config to read the datasets and their mix
+        # datasets_mixer:
+        #     - 'dataset1': 0.5
+        #     - 'dataset2': 0.3
+        #     - 'dataset3': 0.2
+        dataset_mixer = data_config.dataset_mixer
+    elif isinstance(data_config, dict):
+        # Structure of the input is:
+        #     dataset_mixer = {
+        #             "dataset1": 0.5,
+        #             "dataset1": 0.3,
+        #             "dataset1": 0.2,
+        #         }
+        dataset_mixer = data_config
+    else:
+        raise ValueError(f"Data config {data_config} not recognized.")
+    raw_datasets = mix_datasets(dataset_mixer, splits=splits, shuffle=shuffle)
+    return raw_datasets
+def mix_datasets(dataset_mixer: dict, splits: Optional[List[str]] = None, shuffle=True) -> DatasetDict:
+    """
+    Loads and mixes datasets according to proportions specified in `dataset_mixer`.
+    Args:
+        dataset_mixer (`dict`):
+            Dictionary containing the dataset names and their training proportions. By default, all test proportions are 1.
+        splits (Optional[List[str]], *optional*, defaults to `None`):
+            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
+        shuffle (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the training and testing/validation data.
+    """
+    raw_datasets = DatasetDict()
+    raw_train_datasets = []
+    raw_val_datasets = []
+    fracs = []
+    for ds, frac in dataset_mixer.items():
+        fracs.append(frac)
+        for idx, split in enumerate(splits):
+            try:
+                # Try first if dataset on a Hub repo
+                dataset = load_dataset(ds, split=split)
+            except DatasetGenerationError:
+                # If not, check local dataset
+                dataset = load_from_disk(os.path.join(ds, split))
+            if idx == 0:
+                raw_train_datasets.append(dataset)
+            else:
+                raw_val_datasets.append(dataset)
+    if any(frac < 0 for frac in fracs):
+        raise ValueError("Dataset fractions cannot be negative.")
+    if len(raw_train_datasets) > 0:
+        train_subsets = []
+        for dataset, frac in zip(raw_train_datasets, fracs):
+            train_subset = dataset.select(range(int(frac * len(dataset))))
+            train_subsets.append(train_subset)
+        if shuffle:
+            raw_datasets["train"] = concatenate_datasets(train_subsets).shuffle(seed=42)
+        else:
+            raw_datasets["train"] = concatenate_datasets(train_subsets)
+    # No subsampling for test datasets to enable fair comparison across models
+    if len(raw_val_datasets) > 0:
+        if shuffle:
+            raw_datasets["test"] = concatenate_datasets(raw_val_datasets).shuffle(seed=42)
+        else:
+            raw_datasets["test"] = concatenate_datasets(raw_val_datasets)
+    if len(raw_datasets) == 0:
+        raise ValueError(
+            f"Dataset {dataset_mixer} not recognized with split {split}. Check the dataset has been correctly formatted."
+        )
+    return raw_datasets

alignment/model_utils.py ADDED Viewed

	@@ -0,0 +1,119 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from pathlib import Path
+from typing import Dict
+import torch
+from transformers import AutoTokenizer, BitsAndBytesConfig, PreTrainedTokenizer
+from transformers.trainer_utils import get_last_checkpoint
+from accelerate import Accelerator
+from huggingface_hub import list_repo_files
+from huggingface_hub.utils._validators import HFValidationError
+from peft import LoraConfig, PeftConfig
+from .configs import DataArguments, DPOConfig, ModelArguments, SFTConfig
+from .data import DEFAULT_CHAT_TEMPLATE
+def get_current_device() -> int:
+    """Get the current device. For GPU we return the local process index to enable multiple GPU training."""
+    return Accelerator().local_process_index if torch.cuda.is_available() else "cpu"
+def get_kbit_device_map() -> Dict[str, int] | None:
+    """Useful for running inference with quantized models by setting `device_map=get_peft_device_map()`"""
+    return {"": get_current_device()} if torch.cuda.is_available() else None
+def get_quantization_config(model_args: ModelArguments) -> BitsAndBytesConfig | None:
+    if model_args.load_in_4bit:
+        compute_dtype = torch.float16
+        if model_args.torch_dtype not in {"auto", None}:
+            compute_dtype = getattr(torch, model_args.torch_dtype)
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=compute_dtype,
+            bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
+            bnb_4bit_use_double_quant=model_args.use_bnb_nested_quant,
+        )
+    elif model_args.load_in_8bit:
+        quantization_config = BitsAndBytesConfig(
+            load_in_8bit=True,
+        )
+    else:
+        quantization_config = None
+    return quantization_config
+def get_tokenizer(model_args: ModelArguments, data_args: DataArguments) -> PreTrainedTokenizer:
+    """Get the tokenizer for the model."""
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.model_name_or_path,
+        revision=model_args.model_revision,
+    )
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+    if data_args.truncation_side is not None:
+        tokenizer.truncation_side = data_args.truncation_side
+    # Set reasonable default for models without max length
+    if tokenizer.model_max_length > 100_000:
+        tokenizer.model_max_length = 2048
+    if data_args.chat_template is not None:
+        tokenizer.chat_template = data_args.chat_template
+    elif tokenizer.chat_template is None and tokenizer.default_chat_template is None:
+        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE
+    return tokenizer
+def get_peft_config(model_args: ModelArguments) -> PeftConfig | None:
+    if model_args.use_peft is False:
+        return None
+    peft_config = LoraConfig(
+        r=model_args.lora_r,
+        lora_alpha=model_args.lora_alpha,
+        lora_dropout=model_args.lora_dropout,
+        bias="none",
+        task_type="CAUSAL_LM",
+        target_modules=model_args.lora_target_modules,
+        modules_to_save=model_args.lora_modules_to_save,
+    )
+    return peft_config
+def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool:
+    try:
+        # Try first if model on a Hub repo
+        repo_files = list_repo_files(model_name_or_path, revision=revision)
+    except HFValidationError:
+        # If not, check local repo
+        repo_files = os.listdir(model_name_or_path)
+    return "adapter_model.safetensors" in repo_files or "adapter_model.bin" in repo_files
+def get_checkpoint(training_args: SFTConfig | DPOConfig) -> Path | None:
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir):
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+    return last_checkpoint

alignment/release.py ADDED Viewed

	@@ -0,0 +1,106 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import re
+import packaging.version
+# For each kind of file: the regex locating its version line and the replacement template, in which the
+# literal text VERSION is substituted with the real version string by `update_version_in_file`.
+REPLACE_PATTERNS = {
+    "init": (re.compile(r'^__version__\s+=\s+"([^"]+)"\s*$', re.MULTILINE), '__version__ = "VERSION"\n'),
+    "setup": (re.compile(r'^(\s*)version\s*=\s*"[^"]+",', re.MULTILINE), r'\1version="VERSION",'),
+}
+# Path of the file each pattern is applied to; the keys mirror those of REPLACE_PATTERNS.
+REPLACE_FILES = {
+    "init": "src/alignment/__init__.py",
+    "setup": "setup.py",
+}
+# NOTE(review): not referenced by any function visible in this file -- confirm whether it is still needed.
+README_FILE = "README.md"
+def update_version_in_file(fname, version, pattern):
+    """Update the version in one file using a specific pattern."""
+    with open(fname, "r", encoding="utf-8", newline="\n") as f:
+        code = f.read()
+    re_pattern, replace = REPLACE_PATTERNS[pattern]
+    replace = replace.replace("VERSION", version)
+    code = re_pattern.sub(replace, code)
+    with open(fname, "w", encoding="utf-8", newline="\n") as f:
+        f.write(code)
+def global_version_update(version, patch=False):
+    """Update the version in all needed files."""
+    for pattern, fname in REPLACE_FILES.items():
+        update_version_in_file(fname, version, pattern)
+def get_version():
+    """Reads the current version in the __init__."""
+    with open(REPLACE_FILES["init"], "r") as f:
+        code = f.read()
+    default_version = REPLACE_PATTERNS["init"][0].search(code).groups()[0]
+    return packaging.version.parse(default_version)
+def pre_release_work(patch=False):
+    """Do all the necessary pre-release steps."""
+    # First let's get the default version: base version if we are in dev, bump minor otherwise.
+    default_version = get_version()
+    if patch and default_version.is_devrelease:
+        raise ValueError("Can't create a patch version from the dev branch, checkout a released version!")
+    if default_version.is_devrelease:
+        default_version = default_version.base_version
+    elif patch:
+        default_version = f"{default_version.major}.{default_version.minor}.{default_version.micro + 1}"
+    else:
+        default_version = f"{default_version.major}.{default_version.minor + 1}.0"
+    # Now let's ask nicely if that's the right one.
+    version = input(f"Which version are you releasing? [{default_version}]")
+    if len(version) == 0:
+        version = default_version
+    print(f"Updating version to {version}.")
+    global_version_update(version, patch=patch)
+def post_release_work():
+    """Do all the necessary post-release steps."""
+    # First let's get the current version
+    current_version = get_version()
+    dev_version = f"{current_version.major}.{current_version.minor + 1}.0.dev0"
+    current_version = current_version.base_version
+    # Check with the user we got that right.
+    version = input(f"Which version are we developing now? [{dev_version}]")
+    if len(version) == 0:
+        version = dev_version
+    print(f"Updating version to {version}.")
+    global_version_update(version)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--post_release", action="store_true", help="Whether this is pre or post release.")
+    parser.add_argument("--patch", action="store_true", help="Whether or not this is a patch release.")
+    args = parser.parse_args()
+    if not args.post_release:
+        pre_release_work(patch=args.patch)
+    elif args.patch:
+        print("Nothing to do after a patch :-)")
+    else:
+        post_release_work()

config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "_name_or_path": "sanchit-gandhi/Mistral-7B-v0.1-6-layer",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.0.dev0",
+  "use_cache": false,
+  "vocab_size": 32000
+}

config_full.yaml ADDED Viewed

	@@ -0,0 +1,45 @@

+# Model arguments
+model_name_or_path: sanchit-gandhi/Mistral-7B-v0.1-6-layer
+model_revision: main
+torch_dtype: bfloat16
+use_flash_attention_2: false  # torch sdpa sufficient
+# Data training arguments
+chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+dataset_mixer:
+  stingning/ultrachat: 1.0
+dataset_splits:
+- train[1000:]
+- train[:1000]
+preprocessing_num_workers: 32
+# SFT trainer config
+bf16: true
+do_eval: true
+evaluation_strategy: steps
+eval_steps: 5000
+save_strategy: "steps"
+save_total_limit: 5000
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_strategy: every_save
+learning_rate: 0.0001
+log_level: info
+logging_steps: 25
+logging_strategy: steps
+max_seq_length: 2048
+max_steps: 20000
+output_dir: ./
+overwrite_output_dir: true
+per_device_eval_batch_size: 32
+per_device_train_batch_size: 32
+push_to_hub: true
+remove_unused_columns: true
+report_to:
+- tensorboard
+- wandb
+seed: 42
+warmup_steps: 500
+ddp_timeout: 7200

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d7f84b30ad1e26b72493f2e487a84b8fb077327a611d56fcd0605d78146fa822
+size 3141646744

run_sft.py ADDED Viewed

	@@ -0,0 +1,218 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Supervised fine-tuning script for decoder language models.
+"""
+import logging
+import random
+import sys
+import datasets
+import torch
+import transformers
+from transformers import set_seed
+from alignment import (
+    DataArguments,
+    H4ArgumentParser,
+    ModelArguments,
+    SFTConfig,
+    apply_chat_template,
+    get_checkpoint,
+    get_datasets,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+    get_tokenizer,
+)
+from trl import SFTTrainer
+logger = logging.getLogger(__name__)
+def main():
+    parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig))
+    model_args, data_args, training_args = parser.parse()
+    # Set seed for reproducibility
+    set_seed(training_args.seed)
+    ###############
+    # Setup logging
+    ###############
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+    # Log on each process a small summary
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+    logger.info(f"Model parameters {model_args}")
+    logger.info(f"Data parameters {data_args}")
+    logger.info(f"Training/evaluation parameters {training_args}")
+    # Check for last checkpoint
+    last_checkpoint = get_checkpoint(training_args)
+    if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+        logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.")
+    ###############
+    # Load datasets
+    ###############
+    raw_datasets = get_datasets(data_args, splits=data_args.dataset_splits)
+    logger.info(
+        f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
+    )
+    column_names = list(raw_datasets["train"].features)
+    if "messages" not in column_names:
+        with training_args.main_process_first(desc="Log a few random samples from the processed training set"):
+            def format_messages(example):
+                messages = []
+                for idx, message in enumerate(example["data"]):
+                    role = "user" if idx % 2 == 0 else "assistant"
+                    messages.append({"content": message, "role": role})
+                example["messages"] = messages
+                return example
+            raw_datasets = raw_datasets.map(format_messages, desc="Formatting messages", num_proc=data_args.preprocessing_num_workers)
+    ################
+    # Load tokenizer
+    ################
+    tokenizer = get_tokenizer(model_args, data_args)
+    #####################
+    # Apply chat template
+    #####################
+    with training_args.main_process_first():
+        raw_datasets = raw_datasets.map(
+            apply_chat_template,
+            fn_kwargs={"tokenizer": tokenizer, "task": "sft"},
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            desc="Applying chat template",
+        )
+    train_dataset = raw_datasets["train"]
+    eval_dataset = raw_datasets["test"]
+    with training_args.main_process_first(desc="Log a few random samples from the processed training set"):
+        for index in random.sample(range(len(raw_datasets["train"])), 3):
+            logger.info(f"Sample {index} of the processed training set:\n\n{raw_datasets['train'][index]['text']}")
+    #######################
+    # Load pretrained model
+    #######################
+    logger.info("*** Load pretrained model ***")
+    torch_dtype = (
+        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
+    )
+    quantization_config = get_quantization_config(model_args)
+    model_kwargs = dict(
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+        use_flash_attention_2=model_args.use_flash_attention_2,
+        torch_dtype=torch_dtype,
+        use_cache=False if training_args.gradient_checkpointing else True,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+    )
+    logger.info("*** Model loaded! ***")
+    ########################
+    # Initialize the Trainer
+    ########################
+    trainer = SFTTrainer(
+        model=model_args.model_name_or_path,
+        model_init_kwargs=model_kwargs,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        dataset_text_field="text",
+        max_seq_length=training_args.max_seq_length,
+        tokenizer=tokenizer,
+        packing=True,
+        peft_config=get_peft_config(model_args),
+    )
+    ###############
+    # Training loop
+    ###############
+    logger.info("*** Train ***")
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    elif last_checkpoint is not None:
+        checkpoint = last_checkpoint
+    train_result = trainer.train(resume_from_checkpoint=checkpoint)
+    metrics = train_result.metrics
+    metrics["train_samples"] = len(train_dataset)
+    trainer.log_metrics("train", metrics)
+    trainer.save_metrics("train", metrics)
+    trainer.save_state()
+    ##########
+    # Evaluate
+    ##########
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+        metrics = trainer.evaluate()
+        metrics["eval_samples"] = len(eval_dataset)
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+    ##################################
+    # Save model and create model card
+    ##################################
+    logger.info("*** Save model ***")
+    trainer.save_model(training_args.output_dir)
+    logger.info(f"Model saved to {training_args.output_dir}")
+    # Save everything else on main process
+    kwargs = {
+        "finetuned_from": model_args.model_name_or_path,
+        "dataset": list(data_args.dataset_mixer.keys()),
+        "dataset_tags": list(data_args.dataset_mixer.keys()),
+        "tags": ["alignment-handbook"],
+    }
+    if trainer.accelerator.is_main_process:
+        trainer.create_model_card(**kwargs)
+        # Restore k,v cache for fast inference
+        trainer.model.config.use_cache = True
+        trainer.model.config.save_pretrained(training_args.output_dir)
+    if training_args.push_to_hub is True:
+        logger.info("Pushing to hub...")
+        trainer.push_to_hub(**kwargs)
+    logger.info("*** Training complete ***")
+# Run the fine-tuning pipeline only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()

runs/Apr24_14-23-38_ip-26-0-162-233/events.out.tfevents.1713973415.ip-26-0-162-233.1840687.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:713e8ed73c7d50dde946e1af7c24c1babc165667a442d5f8e8f3674cf32ae072
+size 4886

runs/Apr24_16-42-31_ip-26-0-162-233/events.out.tfevents.1713977002.ip-26-0-162-233.1854033.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d99d51bb7fcd506f76273107b7f54b0a07695a2cf620317840bf9823aa458c38
+size 9086

slurm_job.slurm ADDED Viewed

	@@ -0,0 +1,76 @@

+#!/bin/bash
+#SBATCH --job-name=distil-zephyr
+#SBATCH --nodes=1
+# set 24h for job wall time limit
+#SBATCH --time=24:00:00
+#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:8
+#SBATCH --exclusive
+#SBATCH --partition=hopper-prod
+#SBATCH --output=/fsx/sanchit/alignment-logs/%x-%j.out
+# Trace commands and abort on the first error. pipefail makes a failing srun
+# fail the job even though its output is piped through tee.
+set -x -e -o pipefail
+# START EDIT
+source ~/.bashrc
+source /fsx/sanchit/miniconda3/bin/activate alignment
+LOG_PATH="/fsx/sanchit/alignment-logs/main_log.txt"
+# END EDIT
+# Make sure the directory for the shared log exists before tee appends to it.
+mkdir -p "$(dirname "$LOG_PATH")"
+echo "START TIME: $(date)"
+GPUS_PER_NODE=8
+NNODES=$SLURM_NNODES
+# so processes know who to talk to
+# NOTE: MASTER_ADDR / MASTER_PORT are computed here but not passed to the
+# launcher command below.
+MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+# From https://i.hsfzxjy.site/2021-03-10-obtain-a-random-unused-tcp-port-with-bash/
+# Print N (default 1) random TCP ports in 1025-65535 that `ss` does not list as in use.
+function unused_port() {
+    N=${1:-1}
+    comm -23 \
+        <(seq "1025" "65535" | sort) \
+        <(ss -Htan |
+            awk '{print $4}' |
+            cut -d':' -f2 |
+            sort -u) |
+        shuf |
+        head -n "$N"
+}
+MASTER_PORT=$(unused_port)
+# export TORCH_CPP_LOG_LEVEL=INFO
+# export TORCH_DISTRIBUTED_DEBUG=DETAIL
+export LAUNCHER="python -u -m accelerate.commands.launch --config_file ./accelerate_config.yaml"
+export PROGRAM="./run_sft.py ./config_full.yaml"
+export CMD="$LAUNCHER $PROGRAM"
+echo "$CMD"
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
+SRUN_ARGS=" \
+    --wait=60 \
+    --kill-on-bad-exit=1 \
+    "
+# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
+# SRUN_ARGS is intentionally unquoted so it splits into separate options.
+# No `clear` here: in a batch job TERM may be unset, and a failing `clear`
+# would abort the script under `set -e`.
+srun $SRUN_ARGS --jobid "$SLURM_JOB_ID" bash -c "$CMD" 2>&1 | tee -a "$LOG_PATH"
+echo "END TIME: $(date)"

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1464ab5521091ef44c1647b6866ecc70515e4a2469ed5b7ed407275c3c551c0d
+size 4984

wandb/debug-cli.sanchit.log ADDED Viewed

File without changes

wandb/debug-internal.log ADDED Viewed

The diff for this file is too large to render. See raw diff

wandb/debug.log ADDED Viewed

	@@ -0,0 +1,28 @@

+2024-04-24 16:43:24,533 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Configure stats pid to 1854033
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/settings
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py'}
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug.log
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:init():564] calling init triggers
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
+config: {}
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:init():614] starting backend
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:init():618] setting up manager
+2024-04-24 16:43:24,537 INFO    MainThread:1854033 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-04-24 16:43:24,541 INFO    MainThread:1854033 [wandb_init.py:init():624] backend started and connected
+2024-04-24 16:43:24,544 INFO    MainThread:1854033 [wandb_init.py:init():716] updated telemetry
+2024-04-24 16:43:24,569 INFO    MainThread:1854033 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
+2024-04-24 16:43:24,850 INFO    MainThread:1854033 [wandb_run.py:_on_init():2254] communicating current version
+2024-04-24 16:43:24,896 INFO    MainThread:1854033 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.6 is available!  To upgrade, please run:\n $ pip install wandb --upgrade"
+2024-04-24 16:43:24,896 INFO    MainThread:1854033 [wandb_init.py:init():800] starting run threads in backend
+2024-04-24 16:43:30,532 INFO    MainThread:1854033 [wandb_run.py:_console_start():2233] atexit reg
+2024-04-24 16:43:30,532 INFO    MainThread:1854033 [wandb_run.py:_redirect():2088] redirect: wrap_raw
+2024-04-24 16:43:30,532 INFO    MainThread:1854033 [wandb_run.py:_redirect():2153] Wrapping output streams.
+2024-04-24 16:43:30,532 INFO    MainThread:1854033 [wandb_run.py:_redirect():2178] Redirects installed.
+2024-04-24 16:43:30,533 INFO    MainThread:1854033 [wandb_init.py:init():841] run started, returning control to user process
+2024-04-24 16:43:30,535 INFO    MainThread:1854033 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.40.0.dev0', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 20000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr24_16-42-31_ip-26-0-162-233', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 5000, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 5000, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': 
False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}

wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml ADDED Viewed

	@@ -0,0 +1,300 @@

+name: venv
+channels:
+  - pytorch
+  - nvidia
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - blas=1.0=mkl
+  - brotli-python=1.0.9=py311h6a678d5_7
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.12.12=h06a4308_0
+  - certifi=2023.11.17=py311h06a4308_0
+  - cffi=1.16.0=py311h5eee18b_0
+  - cryptography=41.0.7=py311hdda0065_0
+  - cuda-cudart=12.1.105=0
+  - cuda-cupti=12.1.105=0
+  - cuda-libraries=12.1.0=0
+  - cuda-nvrtc=12.1.105=0
+  - cuda-nvtx=12.1.105=0
+  - cuda-opencl=12.3.101=0
+  - cuda-runtime=12.1.0=0
+  - ffmpeg=4.3=hf484d3e_0
+  - filelock=3.13.1=py311h06a4308_0
+  - freetype=2.12.1=h4a9f257_0
+  - giflib=5.2.1=h5eee18b_3
+  - gmp=6.2.1=h295c915_3
+  - gmpy2=2.1.2=py311hc9b5ff0_0
+  - gnutls=3.6.15=he1e5248_0
+  - intel-openmp=2023.1.0=hdb19cb5_46306
+  - jinja2=3.1.2=py311h06a4308_0
+  - jpeg=9e=h5eee18b_1
+  - lame=3.100=h7b6447c_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - lerc=3.0=h295c915_0
+  - libcublas=12.1.0.26=0
+  - libcufft=11.0.2.4=0
+  - libcufile=1.8.1.2=0
+  - libcurand=10.3.4.101=0
+  - libcusolver=11.4.4.55=0
+  - libcusparse=12.0.2.55=0
+  - libdeflate=1.17=h5eee18b_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libiconv=1.16=h7f8727e_2
+  - libidn2=2.3.4=h5eee18b_0
+  - libjpeg-turbo=2.0.0=h9bf148f_0
+  - libnpp=12.0.2.50=0
+  - libnvjitlink=12.1.105=0
+  - libnvjpeg=12.1.1.14=0
+  - libpng=1.6.39=h5eee18b_0
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libtasn1=4.19.0=h5eee18b_0
+  - libtiff=4.5.1=h6a678d5_0
+  - libunistring=0.9.10=h27cfd23_0
+  - libuuid=1.41.5=h5eee18b_0
+  - libwebp=1.3.2=h11a3e52_0
+  - libwebp-base=1.3.2=h5eee18b_0
+  - llvm-openmp=14.0.6=h9e868ea_0
+  - lz4-c=1.9.4=h6a678d5_0
+  - markupsafe=2.1.1=py311h5eee18b_0
+  - mkl=2023.1.0=h213fc3f_46344
+  - mkl-service=2.4.0=py311h5eee18b_1
+  - mkl_fft=1.3.8=py311h5eee18b_0
+  - mkl_random=1.2.4=py311hdb19cb5_0
+  - mpc=1.1.0=h10f8cd9_1
+  - mpfr=4.0.2=hb69a4c5_1
+  - mpmath=1.3.0=py311h06a4308_0
+  - ncurses=6.4=h6a678d5_0
+  - nettle=3.7.3=hbbd107a_1
+  - networkx=3.1=py311h06a4308_0
+  - numpy=1.26.2=py311h08b1b3b_0
+  - numpy-base=1.26.2=py311hf175353_0
+  - openh264=2.1.1=h4ff587b_0
+  - openjpeg=2.4.0=h3ad879b_0
+  - openssl=3.0.12=h7f8727e_0
+  - pycparser=2.21=pyhd3eb1b0_0
+  - pyopenssl=23.2.0=py311h06a4308_0
+  - pysocks=1.7.1=py311h06a4308_0
+  - python=3.11.5=h955ad1f_0
+  - pytorch-cuda=12.1=ha16c6d3_5
+  - pytorch-mutex=1.0=cuda
+  - pyyaml=6.0.1=py311h5eee18b_0
+  - readline=8.2=h5eee18b_0
+  - requests=2.31.0=py311h06a4308_0
+  - setuptools=68.2.2=py311h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - sympy=1.12=py311h06a4308_0
+  - tbb=2021.8.0=hdb19cb5_0
+  - tk=8.6.12=h1ccaba5_0
+  - wheel=0.41.2=py311h06a4308_0
+  - xz=5.4.5=h5eee18b_0
+  - yaml=0.2.5=h7b6447c_0
+  - zlib=1.2.13=h5eee18b_0
+  - zstd=1.5.5=hc292b87_0
+  - pip:
+      - absl-py==2.0.0
+      - accelerate==0.29.3
+      - aiohttp==3.9.1
+      - aiosignal==1.3.1
+      - annotated-types==0.6.0
+      - anyio==4.2.0
+      - appdirs==1.4.4
+      - argon2-cffi==23.1.0
+      - argon2-cffi-bindings==21.2.0
+      - arrow==1.3.0
+      - asttokens==2.4.1
+      - astunparse==1.6.3
+      - async-lru==2.0.4
+      - attrs==23.1.0
+      - audioread==3.0.1
+      - babel==2.14.0
+      - beautifulsoup4==4.12.3
+      - bitsandbytes==0.43.1
+      - bleach==6.1.0
+      - cachetools==5.3.2
+      - chardet==5.2.0
+      - charset-normalizer==3.3.2
+      - click==8.1.7
+      - comm==0.2.1
+      - datasets==2.18.1.dev0
+      - debugpy==1.8.1
+      - decorator==5.1.1
+      - deepspeed==0.12.2
+      - defusedxml==0.7.1
+      - dill==0.3.7
+      - docker-pycreds==0.4.0
+      - docstring-parser==0.15
+      - einops==0.7.0
+      - evaluate==0.4.0
+      - executing==2.0.1
+      - fastjsonschema==2.19.1
+      - flatbuffers==23.5.26
+      - fqdn==1.5.1
+      - frozenlist==1.4.1
+      - fsspec==2023.10.0
+      - gast==0.5.4
+      - gitdb==4.0.11
+      - gitpython==3.1.40
+      - google-auth==2.26.1
+      - google-auth-oauthlib==1.2.0
+      - google-pasta==0.2.0
+      - grpcio==1.60.0
+      - h11==0.14.0
+      - h5py==3.10.0
+      - hf-transfer==0.1.5
+      - hjson==3.1.0
+      - httpcore==1.0.2
+      - httpx==0.26.0
+      - huggingface-hub==0.22.2
+      - idna==3.6
+      - ipdb==0.13.13
+      - ipykernel==6.29.2
+      - ipython==8.21.0
+      - isoduration==20.11.0
+      - jedi==0.19.1
+      - jiwer==3.0.3
+      - joblib==1.3.2
+      - json5==0.9.14
+      - jsonpointer==2.4
+      - jsonschema==4.21.1
+      - jsonschema-specifications==2023.12.1
+      - jupyter-client==8.6.0
+      - jupyter-core==5.7.1
+      - jupyter-events==0.9.0
+      - jupyter-lsp==2.2.2
+      - jupyter-server==2.12.5
+      - jupyter-server-terminals==0.5.2
+      - jupyterlab==4.1.1
+      - jupyterlab-pygments==0.3.0
+      - jupyterlab-server==2.25.2
+      - keras==2.15.0
+      - lazy-loader==0.3
+      - libclang==16.0.6
+      - librosa==0.10.1
+      - llvmlite==0.41.1
+      - markdown==3.5.1
+      - markdown-it-py==3.0.0
+      - matplotlib-inline==0.1.6
+      - mdurl==0.1.2
+      - mistune==3.0.2
+      - ml-dtypes==0.2.0
+      - msgpack==1.0.7
+      - multidict==6.0.4
+      - multiprocess==0.70.15
+      - nbclient==0.9.0
+      - nbconvert==7.16.0
+      - nbformat==5.9.2
+      - nest-asyncio==1.6.0
+      - ninja==1.11.1.1
+      - nltk==3.8.1
+      - notebook-shim==0.2.3
+      - numba==0.58.1
+      - nvidia-cublas-cu12==12.1.3.1
+      - nvidia-cuda-cupti-cu12==12.1.105
+      - nvidia-cuda-nvrtc-cu12==12.1.105
+      - nvidia-cuda-runtime-cu12==12.1.105
+      - nvidia-cudnn-cu12==8.9.2.26
+      - nvidia-cufft-cu12==11.0.2.54
+      - nvidia-curand-cu12==10.3.2.106
+      - nvidia-cusolver-cu12==11.4.5.107
+      - nvidia-cusparse-cu12==12.1.0.106
+      - nvidia-nccl-cu12==2.20.5
+      - nvidia-nvjitlink-cu12==12.3.101
+      - nvidia-nvtx-cu12==12.1.105
+      - oauthlib==3.2.2
+      - opt-einsum==3.3.0
+      - overrides==7.7.0
+      - packaging==23.2
+      - pandas==2.1.4
+      - pandocfilters==1.5.1
+      - parso==0.8.3
+      - peft==0.7.1
+      - pexpect==4.9.0
+      - pillow==10.2.0
+      - pip==24.0
+      - platformdirs==4.1.0
+      - pooch==1.8.0
+      - prometheus-client==0.19.0
+      - prompt-toolkit==3.0.43
+      - protobuf==3.20.2
+      - psutil==5.9.7
+      - ptyprocess==0.7.0
+      - pure-eval==0.2.2
+      - py-cpuinfo==9.0.0
+      - pyarrow==14.0.2
+      - pyarrow-hotfix==0.6
+      - pyasn1==0.5.1
+      - pyasn1-modules==0.3.0
+      - pydantic==2.6.0
+      - pydantic-core==2.16.1
+      - pygments==2.17.2
+      - pynvml==11.5.0
+      - python-dateutil==2.8.2
+      - python-json-logger==2.0.7
+      - pytorch-triton==3.0.0+989adb9a29
+      - pytz==2023.3.post1
+      - pyzmq==25.1.2
+      - rapidfuzz==3.6.1
+      - referencing==0.33.0
+      - regex==2023.12.25
+      - requests-oauthlib==1.3.1
+      - responses==0.18.0
+      - rfc3339-validator==0.1.4
+      - rfc3986-validator==0.1.1
+      - rich==13.7.0
+      - rpds-py==0.17.1
+      - rsa==4.9
+      - safetensors==0.4.1
+      - scikit-learn==1.3.2
+      - scipy==1.11.4
+      - send2trash==1.8.2
+      - sentencepiece==0.1.99
+      - sentry-sdk==1.39.1
+      - setproctitle==1.3.3
+      - shtab==1.6.5
+      - six==1.16.0
+      - smmap==5.0.1
+      - sniffio==1.3.0
+      - soundfile==0.12.1
+      - soupsieve==2.5
+      - soxr==0.3.7
+      - stack-data==0.6.3
+      - tensorboard==2.15.1
+      - tensorboard-data-server==0.7.2
+      - tensorflow-cpu==2.15.0.post1
+      - tensorflow-estimator==2.15.0
+      - tensorflow-io-gcs-filesystem==0.35.0
+      - termcolor==2.4.0
+      - terminado==0.18.0
+      - threadpoolctl==3.2.0
+      - tinycss2==1.2.1
+      - tokenizers==0.15.0
+      - torch==2.4.0.dev20240323+cu121
+      - torchaudio==2.2.0.dev20240323+cu121
+      - torchvision==0.19.0.dev20240323+cu121
+      - tornado==6.4
+      - tqdm==4.66.1
+      - traitlets==5.14.1
+      - transformers==4.39.0.dev0
+      - triton==2.2.0
+      - trl==0.8.6
+      - types-python-dateutil==2.8.19.20240106
+      - typing-extensions==4.10.0
+      - tyro==0.7.0
+      - tzdata==2023.3
+      - uri-template==1.3.0
+      - urllib3==2.1.0
+      - wandb==0.16.1
+      - wcwidth==0.2.13
+      - webcolors==1.13
+      - webencodings==0.5.1
+      - websocket-client==1.7.0
+      - werkzeug==3.0.1
+      - wrapt==1.14.1
+      - xxhash==3.4.1
+      - yarl==1.9.4
+prefix: /fsx/sanchit/miniconda3/envs/venv

wandb/run-20240424_154339-mwp0iutr/files/config.yaml ADDED Viewed

	@@ -0,0 +1,663 @@

+wandb_version: 1
+_wandb:
+  desc: null
+  value:
+    python_version: 3.11.5
+    cli_version: 0.16.1
+    framework: huggingface
+    huggingface_version: 4.40.0.dev0
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1713973419.470656
+    t:
+      1:
+      - 1
+      - 2
+      - 3
+      - 5
+      - 11
+      - 49
+      - 51
+      - 53
+      - 55
+      - 71
+      - 84
+      - 98
+      2:
+      - 1
+      - 2
+      - 3
+      - 5
+      - 11
+      - 49
+      - 51
+      - 53
+      - 55
+      - 71
+      - 84
+      - 98
+      3:
+      - 7
+      - 23
+      4: 3.11.5
+      5: 0.16.1
+      6: 4.40.0.dev0
+      8:
+      - 5
+      9:
+        1: transformers_trainer
+      13: linux-x86_64
+    m:
+    - 1: train/global_step
+      6:
+      - 3
+    - 1: train/loss
+      5: 1
+      6:
+      - 1
+    - 1: train/grad_norm
+      5: 1
+      6:
+      - 1
+    - 1: train/learning_rate
+      5: 1
+      6:
+      - 1
+    - 1: train/epoch
+      5: 1
+      6:
+      - 1
+vocab_size:
+  desc: null
+  value: 32000
+max_position_embeddings:
+  desc: null
+  value: 32768
+hidden_size:
+  desc: null
+  value: 4096
+intermediate_size:
+  desc: null
+  value: 14336
+num_hidden_layers:
+  desc: null
+  value: 6
+num_attention_heads:
+  desc: null
+  value: 32
+sliding_window:
+  desc: null
+  value: 4096
+num_key_value_heads:
+  desc: null
+  value: 8
+hidden_act:
+  desc: null
+  value: silu
+initializer_range:
+  desc: null
+  value: 0.02
+rms_norm_eps:
+  desc: null
+  value: 1.0e-05
+use_cache:
+  desc: null
+  value: false
+rope_theta:
+  desc: null
+  value: 10000.0
+attention_dropout:
+  desc: null
+  value: 0.0
+return_dict:
+  desc: null
+  value: true
+output_hidden_states:
+  desc: null
+  value: false
+output_attentions:
+  desc: null
+  value: false
+torchscript:
+  desc: null
+  value: false
+torch_dtype:
+  desc: null
+  value: bfloat16
+use_bfloat16:
+  desc: null
+  value: false
+tf_legacy_loss:
+  desc: null
+  value: false
+pruned_heads:
+  desc: null
+  value: {}
+tie_word_embeddings:
+  desc: null
+  value: false
+chunk_size_feed_forward:
+  desc: null
+  value: 0
+is_encoder_decoder:
+  desc: null
+  value: false
+is_decoder:
+  desc: null
+  value: false
+cross_attention_hidden_size:
+  desc: null
+  value: null
+add_cross_attention:
+  desc: null
+  value: false
+tie_encoder_decoder:
+  desc: null
+  value: false
+max_length:
+  desc: null
+  value: 20
+min_length:
+  desc: null
+  value: 0
+do_sample:
+  desc: null
+  value: false
+early_stopping:
+  desc: null
+  value: false
+num_beams:
+  desc: null
+  value: 1
+num_beam_groups:
+  desc: null
+  value: 1
+diversity_penalty:
+  desc: null
+  value: 0.0
+temperature:
+  desc: null
+  value: 1.0
+top_k:
+  desc: null
+  value: 50
+top_p:
+  desc: null
+  value: 1.0
+typical_p:
+  desc: null
+  value: 1.0
+repetition_penalty:
+  desc: null
+  value: 1.0
+length_penalty:
+  desc: null
+  value: 1.0
+no_repeat_ngram_size:
+  desc: null
+  value: 0
+encoder_no_repeat_ngram_size:
+  desc: null
+  value: 0
+bad_words_ids:
+  desc: null
+  value: null
+num_return_sequences:
+  desc: null
+  value: 1
+output_scores:
+  desc: null
+  value: false
+return_dict_in_generate:
+  desc: null
+  value: false
+forced_bos_token_id:
+  desc: null
+  value: null
+forced_eos_token_id:
+  desc: null
+  value: null
+remove_invalid_values:
+  desc: null
+  value: false
+exponential_decay_length_penalty:
+  desc: null
+  value: null
+suppress_tokens:
+  desc: null
+  value: null
+begin_suppress_tokens:
+  desc: null
+  value: null
+architectures:
+  desc: null
+  value:
+  - MistralForCausalLM
+finetuning_task:
+  desc: null
+  value: null
+id2label:
+  desc: null
+  value:
+    '0': LABEL_0
+    '1': LABEL_1
+label2id:
+  desc: null
+  value:
+    LABEL_0: 0
+    LABEL_1: 1
+tokenizer_class:
+  desc: null
+  value: null
+prefix:
+  desc: null
+  value: null
+bos_token_id:
+  desc: null
+  value: 1
+pad_token_id:
+  desc: null
+  value: null
+eos_token_id:
+  desc: null
+  value: 2
+sep_token_id:
+  desc: null
+  value: null
+decoder_start_token_id:
+  desc: null
+  value: null
+task_specific_params:
+  desc: null
+  value: null
+problem_type:
+  desc: null
+  value: null
+_name_or_path:
+  desc: null
+  value: sanchit-gandhi/Mistral-7B-v0.1-6-layer
+transformers_version:
+  desc: null
+  value: 4.40.0.dev0
+model_type:
+  desc: null
+  value: mistral
+output_dir:
+  desc: null
+  value: ./
+overwrite_output_dir:
+  desc: null
+  value: true
+do_train:
+  desc: null
+  value: false
+do_eval:
+  desc: null
+  value: true
+do_predict:
+  desc: null
+  value: false
+evaluation_strategy:
+  desc: null
+  value: steps
+prediction_loss_only:
+  desc: null
+  value: false
+per_device_train_batch_size:
+  desc: null
+  value: 64
+per_device_eval_batch_size:
+  desc: null
+  value: 32
+per_gpu_train_batch_size:
+  desc: null
+  value: null
+per_gpu_eval_batch_size:
+  desc: null
+  value: null
+gradient_accumulation_steps:
+  desc: null
+  value: 1
+eval_accumulation_steps:
+  desc: null
+  value: null
+eval_delay:
+  desc: null
+  value: 0
+learning_rate:
+  desc: null
+  value: 0.0001
+weight_decay:
+  desc: null
+  value: 0.0
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.999
+adam_epsilon:
+  desc: null
+  value: 1.0e-08
+max_grad_norm:
+  desc: null
+  value: 1.0
+num_train_epochs:
+  desc: null
+  value: 3.0
+max_steps:
+  desc: null
+  value: 20000
+lr_scheduler_type:
+  desc: null
+  value: linear
+lr_scheduler_kwargs:
+  desc: null
+  value: {}
+warmup_ratio:
+  desc: null
+  value: 0.0
+warmup_steps:
+  desc: null
+  value: 500
+log_level:
+  desc: null
+  value: info
+log_level_replica:
+  desc: null
+  value: warning
+log_on_each_node:
+  desc: null
+  value: true
+logging_dir:
+  desc: null
+  value: ./runs/Apr24_14-23-38_ip-26-0-162-233
+logging_strategy:
+  desc: null
+  value: steps
+logging_first_step:
+  desc: null
+  value: true
+logging_steps:
+  desc: null
+  value: 25
+logging_nan_inf_filter:
+  desc: null
+  value: true
+save_strategy:
+  desc: null
+  value: steps
+save_steps:
+  desc: null
+  value: 500
+save_total_limit:
+  desc: null
+  value: 5000
+save_safetensors:
+  desc: null
+  value: true
+save_on_each_node:
+  desc: null
+  value: false
+save_only_model:
+  desc: null
+  value: false
+no_cuda:
+  desc: null
+  value: false
+use_cpu:
+  desc: null
+  value: false
+use_mps_device:
+  desc: null
+  value: false
+seed:
+  desc: null
+  value: 42
+data_seed:
+  desc: null
+  value: null
+jit_mode_eval:
+  desc: null
+  value: false
+use_ipex:
+  desc: null
+  value: false
+bf16:
+  desc: null
+  value: true
+fp16:
+  desc: null
+  value: false
+fp16_opt_level:
+  desc: null
+  value: O1
+half_precision_backend:
+  desc: null
+  value: auto
+bf16_full_eval:
+  desc: null
+  value: false
+fp16_full_eval:
+  desc: null
+  value: false
+tf32:
+  desc: null
+  value: null
+local_rank:
+  desc: null
+  value: 0
+ddp_backend:
+  desc: null
+  value: null
+tpu_num_cores:
+  desc: null
+  value: null
+tpu_metrics_debug:
+  desc: null
+  value: false
+debug:
+  desc: null
+  value: []
+dataloader_drop_last:
+  desc: null
+  value: false
+eval_steps:
+  desc: null
+  value: 5000
+dataloader_num_workers:
+  desc: null
+  value: 0
+dataloader_prefetch_factor:
+  desc: null
+  value: null
+past_index:
+  desc: null
+  value: -1
+run_name:
+  desc: null
+  value: ./
+disable_tqdm:
+  desc: null
+  value: false
+remove_unused_columns:
+  desc: null
+  value: true
+label_names:
+  desc: null
+  value: null
+load_best_model_at_end:
+  desc: null
+  value: false
+metric_for_best_model:
+  desc: null
+  value: null
+greater_is_better:
+  desc: null
+  value: null
+ignore_data_skip:
+  desc: null
+  value: false
+fsdp:
+  desc: null
+  value: []
+fsdp_min_num_params:
+  desc: null
+  value: 0
+fsdp_config:
+  desc: null
+  value:
+    min_num_params: 0
+    xla: false
+    xla_fsdp_v2: false
+    xla_fsdp_grad_ckpt: false
+fsdp_transformer_layer_cls_to_wrap:
+  desc: null
+  value: null
+accelerator_config:
+  desc: null
+  value:
+    split_batches: false
+    dispatch_batches: null
+    even_batches: true
+    use_seedable_sampler: true
+    gradient_accumulation_kwargs: null
+deepspeed:
+  desc: null
+  value: null
+label_smoothing_factor:
+  desc: null
+  value: 0.0
+optim:
+  desc: null
+  value: adamw_torch
+optim_args:
+  desc: null
+  value: null
+adafactor:
+  desc: null
+  value: false
+group_by_length:
+  desc: null
+  value: false
+length_column_name:
+  desc: null
+  value: length
+report_to:
+  desc: null
+  value:
+  - tensorboard
+  - wandb
+ddp_find_unused_parameters:
+  desc: null
+  value: null
+ddp_bucket_cap_mb:
+  desc: null
+  value: null
+ddp_broadcast_buffers:
+  desc: null
+  value: null
+dataloader_pin_memory:
+  desc: null
+  value: true
+dataloader_persistent_workers:
+  desc: null
+  value: false
+skip_memory_metrics:
+  desc: null
+  value: true
+use_legacy_prediction_loop:
+  desc: null
+  value: false
+push_to_hub:
+  desc: null
+  value: true
+resume_from_checkpoint:
+  desc: null
+  value: null
+hub_model_id:
+  desc: null
+  value: null
+hub_strategy:
+  desc: null
+  value: every_save
+hub_token:
+  desc: null
+  value: <HUB_TOKEN>
+hub_private_repo:
+  desc: null
+  value: false
+hub_always_push:
+  desc: null
+  value: false
+gradient_checkpointing:
+  desc: null
+  value: true
+gradient_checkpointing_kwargs:
+  desc: null
+  value:
+    use_reentrant: false
+include_inputs_for_metrics:
+  desc: null
+  value: false
+fp16_backend:
+  desc: null
+  value: auto
+push_to_hub_model_id:
+  desc: null
+  value: null
+push_to_hub_organization:
+  desc: null
+  value: null
+push_to_hub_token:
+  desc: null
+  value: <PUSH_TO_HUB_TOKEN>
+mp_parameters:
+  desc: null
+  value: ''
+auto_find_batch_size:
+  desc: null
+  value: false
+full_determinism:
+  desc: null
+  value: false
+torchdynamo:
+  desc: null
+  value: null
+ray_scope:
+  desc: null
+  value: last
+ddp_timeout:
+  desc: null
+  value: 7200
+torch_compile:
+  desc: null
+  value: false
+torch_compile_backend:
+  desc: null
+  value: null
+torch_compile_mode:
+  desc: null
+  value: null
+dispatch_batches:
+  desc: null
+  value: null
+split_batches:
+  desc: null
+  value: null
+include_tokens_per_second:
+  desc: null
+  value: false
+include_num_input_tokens_seen:
+  desc: null
+  value: false
+neftune_noise_alpha:
+  desc: null
+  value: null
+optim_target_modules:
+  desc: null
+  value: null
+max_seq_length:
+  desc: null
+  value: 2048

wandb/run-20240424_154339-mwp0iutr/files/output.log ADDED Viewed

	@@ -0,0 +1,131 @@

+  0%|                                                                                         | 0/20000 [00:00<?, ?it/s]/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.)
+  warnings.warn(
+  0%|                                                                              | 1/20000 [00:06<38:19:46,  6.90s/it]
+  0%|                                                                              | 1/20000 [00:06<38:19:46,  6.90s/it]Traceback (most recent call last):
+  File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 217, in <module>
+    main()
+  File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 172, in main
+    train_result = trainer.train(resume_from_checkpoint=checkpoint)
+                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 361, in train
+    output = super().train(*args, **kwargs)
+             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 1849, in train
+    return inner_training_loop(
+           ^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 2202, in _inner_training_loop
+    tr_loss_step = self.training_step(model, inputs)
+                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3137, in training_step
+    loss = self.compute_loss(model, inputs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3160, in compute_loss
+    outputs = model(**inputs)
+              ^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
+    return forward_call(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1608, in forward
+    else self._run_ddp_forward(*inputs, **kwargs)
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1426, in _run_ddp_forward
+    return self.module(*inputs, **kwargs)  # type: ignore[index]
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
+    return forward_call(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 825, in forward
+    return model_forward(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 813, in __call__
+    return convert_to_fp32(self.model_forward(*args, **kwargs))
+                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
+    return func(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/transformers/src/transformers/models/mistral/modeling_mistral.py", line 1184, in forward
+    loss = loss_fct(shift_logits, shift_labels)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
+    return forward_call(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1185, in forward
+    return F.cross_entropy(input, target, weight=self.weight,
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/functional.py", line 3088, in cross_entropy
+    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 15.62 GiB. GPU
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 217, in <module>
+[rank0]:     main()
+[rank0]:   File "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py", line 172, in main
+[rank0]:     train_result = trainer.train(resume_from_checkpoint=checkpoint)
+[rank0]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 361, in train
+[rank0]:     output = super().train(*args, **kwargs)
+[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 1849, in train
+[rank0]:     return inner_training_loop(
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 2202, in _inner_training_loop
+[rank0]:     tr_loss_step = self.training_step(model, inputs)
+[rank0]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3137, in training_step
+[rank0]:     loss = self.compute_loss(model, inputs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/transformers/src/transformers/trainer.py", line 3160, in compute_loss
+[rank0]:     outputs = model(**inputs)
+[rank0]:               ^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1608, in forward
+[rank0]:     else self._run_ddp_forward(*inputs, **kwargs)
+[rank0]:          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1426, in _run_ddp_forward
+[rank0]:     return self.module(*inputs, **kwargs)  # type: ignore[index]
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 825, in forward
+[rank0]:     return model_forward(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/accelerate/utils/operations.py", line 813, in __call__
+[rank0]:     return convert_to_fp32(self.model_forward(*args, **kwargs))
+[rank0]:                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
+[rank0]:     return func(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/transformers/src/transformers/models/mistral/modeling_mistral.py", line 1184, in forward
+[rank0]:     loss = loss_fct(shift_logits, shift_labels)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1536, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1185, in forward
+[rank0]:     return F.cross_entropy(input, target, weight=self.weight,
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/functional.py", line 3088, in cross_entropy
+[rank0]:     return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 15.62 GiB. GPU

wandb/run-20240424_154339-mwp0iutr/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,223 @@

+absl-py==2.0.0
+accelerate==0.29.3
+aiohttp==3.9.1
+aiosignal==1.3.1
+annotated-types==0.6.0
+anyio==4.2.0
+appdirs==1.4.4
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+astunparse==1.6.3
+async-lru==2.0.4
+attrs==23.1.0
+audioread==3.0.1
+babel==2.14.0
+beautifulsoup4==4.12.3
+bitsandbytes==0.43.1
+bleach==6.1.0
+brotli==1.0.9
+cachetools==5.3.2
+certifi==2023.11.17
+cffi==1.16.0
+chardet==5.2.0
+charset-normalizer==2.0.4
+click==8.1.7
+comm==0.2.1
+cryptography==41.0.7
+datasets==2.18.1.dev0
+debugpy==1.8.1
+decorator==5.1.1
+deepspeed==0.12.2
+defusedxml==0.7.1
+dill==0.3.7
+docker-pycreds==0.4.0
+docstring-parser==0.15
+einops==0.7.0
+evaluate==0.4.0
+executing==2.0.1
+fastjsonschema==2.19.1
+filelock==3.13.1
+flatbuffers==23.5.26
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2023.10.0
+gast==0.5.4
+gitdb==4.0.11
+gitpython==3.1.40
+gmpy2==2.1.2
+google-auth-oauthlib==1.2.0
+google-auth==2.26.1
+google-pasta==0.2.0
+grpcio==1.60.0
+h11==0.14.0
+h5py==3.10.0
+hf-transfer==0.1.5
+hjson==3.1.0
+httpcore==1.0.2
+httpx==0.26.0
+huggingface-hub==0.22.2
+idna==3.4
+ipdb==0.13.13
+ipykernel==6.29.2
+ipython==8.21.0
+isoduration==20.11.0
+jedi==0.19.1
+jinja2==3.1.2
+jiwer==3.0.3
+joblib==1.3.2
+json5==0.9.14
+jsonpointer==2.4
+jsonschema-specifications==2023.12.1
+jsonschema==4.21.1
+jupyter-client==8.6.0
+jupyter-core==5.7.1
+jupyter-events==0.9.0
+jupyter-lsp==2.2.2
+jupyter-server-terminals==0.5.2
+jupyter-server==2.12.5
+jupyterlab-pygments==0.3.0
+jupyterlab-server==2.25.2
+jupyterlab==4.1.1
+keras==2.15.0
+lazy-loader==0.3
+libclang==16.0.6
+librosa==0.10.1
+llvmlite==0.41.1
+markdown-it-py==3.0.0
+markdown==3.5.1
+markupsafe==2.1.1
+matplotlib-inline==0.1.6
+mdurl==0.1.2
+mistune==3.0.2
+mkl-fft==1.3.8
+mkl-random==1.2.4
+mkl-service==2.4.0
+ml-dtypes==0.2.0
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+multiprocess==0.70.15
+nbclient==0.9.0
+nbconvert==7.16.0
+nbformat==5.9.2
+nest-asyncio==1.6.0
+networkx==3.1
+ninja==1.11.1.1
+nltk==3.8.1
+notebook-shim==0.2.3
+numba==0.58.1
+numpy==1.26.2
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.3.101
+nvidia-nvtx-cu12==12.1.105
+oauthlib==3.2.2
+opt-einsum==3.3.0
+overrides==7.7.0
+packaging==23.2
+pandas==2.1.4
+pandocfilters==1.5.1
+parso==0.8.3
+peft==0.7.1
+pexpect==4.9.0
+pillow==10.2.0
+pip==24.0
+platformdirs==4.1.0
+pooch==1.8.0
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+protobuf==3.20.2
+psutil==5.9.7
+ptyprocess==0.7.0
+pure-eval==0.2.2
+py-cpuinfo==9.0.0
+pyarrow-hotfix==0.6
+pyarrow==14.0.2
+pyasn1-modules==0.3.0
+pyasn1==0.5.1
+pycparser==2.21
+pydantic-core==2.16.1
+pydantic==2.6.0
+pygments==2.17.2
+pynvml==11.5.0
+pyopenssl==23.2.0
+pysocks==1.7.1
+python-dateutil==2.8.2
+python-json-logger==2.0.7
+pytorch-triton==3.0.0+989adb9a29
+pytz==2023.3.post1
+pyyaml==6.0.1
+pyzmq==25.1.2
+rapidfuzz==3.6.1
+referencing==0.33.0
+regex==2023.12.25
+requests-oauthlib==1.3.1
+requests==2.31.0
+responses==0.18.0
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.7.0
+rpds-py==0.17.1
+rsa==4.9
+safetensors==0.4.1
+scikit-learn==1.3.2
+scipy==1.11.4
+send2trash==1.8.2
+sentencepiece==0.1.99
+sentry-sdk==1.39.1
+setproctitle==1.3.3
+setuptools==68.2.2
+shtab==1.6.5
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.0
+soundfile==0.12.1
+soupsieve==2.5
+soxr==0.3.7
+stack-data==0.6.3
+sympy==1.12
+tensorboard-data-server==0.7.2
+tensorboard==2.15.1
+tensorflow-cpu==2.15.0.post1
+tensorflow-estimator==2.15.0
+tensorflow-io-gcs-filesystem==0.35.0
+termcolor==2.4.0
+terminado==0.18.0
+threadpoolctl==3.2.0
+tinycss2==1.2.1
+tokenizers==0.15.0
+torch==2.4.0.dev20240323+cu121
+torchaudio==2.2.0.dev20240323+cu121
+torchvision==0.19.0.dev20240323+cu121
+tornado==6.4
+tqdm==4.66.1
+traitlets==5.14.1
+transformers==4.39.0.dev0
+triton==2.2.0
+trl==0.8.6
+types-python-dateutil==2.8.19.20240106
+typing-extensions==4.10.0
+tyro==0.7.0
+tzdata==2023.3
+uri-template==1.3.0
+urllib3==1.26.18
+wandb==0.16.1
+wcwidth==0.2.13
+webcolors==1.13
+webencodings==0.5.1
+websocket-client==1.7.0
+werkzeug==3.0.1
+wheel==0.41.2
+wrapt==1.14.1
+xxhash==3.4.1
+yarl==1.9.4

wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,558 @@

+{
+    "os": "Linux-5.15.0-1048-aws-x86_64-with-glibc2.31",
+    "python": "3.11.5",
+    "heartbeatAt": "2024-04-24T15:43:39.965097",
+    "startedAt": "2024-04-24T15:43:39.449266",
+    "docker": null,
+    "cuda": null,
+    "args": [
+        "config_full.yaml"
+    ],
+    "state": "running",
+    "program": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py",
+    "codePathLocal": "run_sft.py",
+    "codePath": "run_sft.py",
+    "git": {
+        "remote": "https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat",
+        "commit": "cbea69c6b95c970317a1e47c3f614b55b33f8ed9"
+    },
+    "email": null,
+    "root": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat",
+    "host": "ip-26-0-162-233",
+    "username": "sanchit",
+    "executable": "/fsx/sanchit/miniconda3/envs/venv/bin/python",
+    "cpu_count": 96,
+    "cpu_count_logical": 96,
+    "cpu_freq": {
+        "current": 2721.9698645833337,
+        "min": 0.0,
+        "max": 0.0
+    },
+    "cpu_freq_per_core": [
+        {
+            "current": 3590.538,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3595.996,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3597.59,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3399.936,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3598.273,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3597.284,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3036.337,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3597.887,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3598.442,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        }
+    ],
+    "disk": {
+        "/": {
+            "total": 290.7472343444824,
+            "used": 59.263893127441406
+        }
+    },
+    "gpu": "NVIDIA H100 80GB HBM3",
+    "gpu_count": 8,
+    "gpu_devices": [
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        }
+    ],
+    "memory": {
+        "total": 1999.9855270385742
+    }
+}

wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"train/loss": 14.0246, "train/grad_norm": 1440.0, "train/learning_rate": 2.0000000000000002e-07, "train/epoch": 0.0, "train/global_step": 1, "_timestamp": 1713973432.7827635, "_runtime": 13.312107563018799, "_step": 0, "_wandb": {"runtime": 14}}

wandb/run-20240424_154339-mwp0iutr/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,209 @@

+2024-04-24 15:43:39,468 INFO    StreamThr :1848599 [internal.py:wandb_internal():86] W&B internal server running at pid: 1848599, started at: 2024-04-24 15:43:39.467078
+2024-04-24 15:43:39,469 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status
+2024-04-24 15:43:39,473 INFO    WriterThread:1848599 [datastore.py:open_for_write():85] open: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb
+2024-04-24 15:43:39,476 DEBUG   SenderThread:1848599 [sender.py:send():382] send: header
+2024-04-24 15:43:39,521 DEBUG   SenderThread:1848599 [sender.py:send():382] send: run
+2024-04-24 15:43:39,793 INFO    SenderThread:1848599 [dir_watcher.py:__init__():211] watching files in: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files
+2024-04-24 15:43:39,793 INFO    SenderThread:1848599 [sender.py:_start_run_threads():1136] run started: mwp0iutr with start time 1713973419.470656
+2024-04-24 15:43:39,798 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: check_version
+2024-04-24 15:43:39,799 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: check_version
+2024-04-24 15:43:39,851 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: run_start
+2024-04-24 15:43:39,908 DEBUG   HandlerThread:1848599 [system_info.py:__init__():32] System info init
+2024-04-24 15:43:39,908 DEBUG   HandlerThread:1848599 [system_info.py:__init__():47] System info init done
+2024-04-24 15:43:39,908 INFO    HandlerThread:1848599 [system_monitor.py:start():194] Starting system monitor
+2024-04-24 15:43:39,908 INFO    SystemMonitor:1848599 [system_monitor.py:_start():158] Starting system asset monitoring threads
+2024-04-24 15:43:39,909 INFO    HandlerThread:1848599 [system_monitor.py:probe():214] Collecting system info
+2024-04-24 15:43:39,909 INFO    SystemMonitor:1848599 [interfaces.py:start():190] Started cpu monitoring
+2024-04-24 15:43:39,909 INFO    SystemMonitor:1848599 [interfaces.py:start():190] Started disk monitoring
+2024-04-24 15:43:39,910 INFO    SystemMonitor:1848599 [interfaces.py:start():190] Started gpu monitoring
+2024-04-24 15:43:39,911 INFO    SystemMonitor:1848599 [interfaces.py:start():190] Started memory monitoring
+2024-04-24 15:43:39,911 INFO    SystemMonitor:1848599 [interfaces.py:start():190] Started network monitoring
+2024-04-24 15:43:39,965 DEBUG   HandlerThread:1848599 [system_info.py:probe():196] Probing system
+2024-04-24 15:43:39,967 DEBUG   HandlerThread:1848599 [system_info.py:_probe_git():181] Probing git
+2024-04-24 15:43:39,987 DEBUG   HandlerThread:1848599 [system_info.py:_probe_git():189] Probing git done
+2024-04-24 15:43:39,987 DEBUG   HandlerThread:1848599 [system_info.py:probe():244] Probing system done
+2024-04-24 15:43:39,987 DEBUG   HandlerThread:1848599 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-1048-aws-x86_64-with-glibc2.31', 'python': '3.11.5', 'heartbeatAt': '2024-04-24T15:43:39.965097', 'startedAt': '2024-04-24T15:43:39.449266', 'docker': None, 'cuda': None, 'args': ('config_full.yaml',), 'state': 'running', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'codePathLocal': 'run_sft.py', 'codePath': 'run_sft.py', 'git': {'remote': 'https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat', 'commit': 'cbea69c6b95c970317a1e47c3f614b55b33f8ed9'}, 'email': None, 'root': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat', 'host': 'ip-26-0-162-233', 'username': 'sanchit', 'executable': '/fsx/sanchit/miniconda3/envs/venv/bin/python', 'cpu_count': 96, 'cpu_count_logical': 96, 'cpu_freq': {'current': 2721.9698645833337, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 3590.538, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3595.996, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.59, 'min': 0.0, 
'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3399.936, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.273, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.284, 'min': 0.0, 'max': 0.0}, {'current': 3036.337, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3597.887, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 3598.442, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 
0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}, {'current': 2650.0, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 290.7472343444824, 'used': 59.263893127441406}}, 'gpu': 'NVIDIA H100 80GB HBM3', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}, {'name': 'NVIDIA H100 80GB HBM3', 'memory_total': 85520809984}], 'memory': {'total': 1999.9855270385742}}
+2024-04-24 15:43:39,988 INFO    HandlerThread:1848599 [system_monitor.py:probe():224] Finished collecting system info
+2024-04-24 15:43:39,988 INFO    HandlerThread:1848599 [system_monitor.py:probe():227] Publishing system info
+2024-04-24 15:43:39,988 DEBUG   HandlerThread:1848599 [system_info.py:_save_pip():52] Saving list of pip packages installed into the current environment
+2024-04-24 15:43:39,989 DEBUG   HandlerThread:1848599 [system_info.py:_save_pip():68] Saving pip packages done
+2024-04-24 15:43:39,990 DEBUG   HandlerThread:1848599 [system_info.py:_save_conda():75] Saving list of conda packages installed into the current environment
+2024-04-24 15:43:40,795 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml
+2024-04-24 15:43:40,796 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/requirements.txt
+2024-04-24 15:43:45,799 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml
+2024-04-24 15:43:45,805 DEBUG   HandlerThread:1848599 [system_info.py:_save_conda():87] Saving conda packages done
+2024-04-24 15:43:45,807 INFO    HandlerThread:1848599 [system_monitor.py:probe():229] Finished publishing system info
+2024-04-24 15:43:45,857 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
+2024-04-24 15:43:45,857 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: keepalive
+2024-04-24 15:43:45,858 DEBUG   SenderThread:1848599 [sender.py:send():382] send: files
+2024-04-24 15:43:45,858 INFO    SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-metadata.json with policy now
+2024-04-24 15:43:45,864 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: stop_status
+2024-04-24 15:43:45,865 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: stop_status
+2024-04-24 15:43:45,867 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: internal_messages
+2024-04-24 15:43:45,993 DEBUG   SenderThread:1848599 [sender.py:send():382] send: telemetry
+2024-04-24 15:43:45,993 DEBUG   SenderThread:1848599 [sender.py:send():382] send: config
+2024-04-24 15:43:45,993 DEBUG   SenderThread:1848599 [sender.py:send():382] send: metric
+2024-04-24 15:43:45,994 DEBUG   SenderThread:1848599 [sender.py:send():382] send: telemetry
+2024-04-24 15:43:45,994 DEBUG   SenderThread:1848599 [sender.py:send():382] send: metric
+2024-04-24 15:43:45,994 WARNING SenderThread:1848599 [sender.py:send_metric():1343] Seen metric with glob (shouldn't happen)
+2024-04-24 15:43:45,994 DEBUG   SenderThread:1848599 [sender.py:send():382] send: telemetry
+2024-04-24 15:43:46,179 INFO    wandb-upload_0:1848599 [upload_job.py:push():131] Uploaded file /tmp/tmphsb5r9cdwandb/sgr8lmob-wandb-metadata.json
+2024-04-24 15:43:46,800 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json
+2024-04-24 15:43:46,801 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
+2024-04-24 15:43:48,803 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
+2024-04-24 15:43:50,251 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
+2024-04-24 15:43:52,783 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: partial_history
+2024-04-24 15:43:52,785 DEBUG   SenderThread:1848599 [sender.py:send():382] send: metric
+2024-04-24 15:43:52,785 DEBUG   SenderThread:1848599 [sender.py:send():382] send: metric
+2024-04-24 15:43:52,786 DEBUG   SenderThread:1848599 [sender.py:send():382] send: metric
+2024-04-24 15:43:52,786 DEBUG   SenderThread:1848599 [sender.py:send():382] send: metric
+2024-04-24 15:43:52,786 DEBUG   SenderThread:1848599 [sender.py:send():382] send: history
+2024-04-24 15:43:52,786 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: summary_record
+2024-04-24 15:43:52,788 INFO    SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
+2024-04-24 15:43:52,807 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_created():271] file/dir created: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
+2024-04-24 15:43:54,212 DEBUG   SenderThread:1848599 [sender.py:send():382] send: exit
+2024-04-24 15:43:54,212 INFO    SenderThread:1848599 [sender.py:send_exit():589] handling exit code: 1
+2024-04-24 15:43:54,212 INFO    SenderThread:1848599 [sender.py:send_exit():591] handling runtime: 14
+2024-04-24 15:43:54,213 INFO    SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
+2024-04-24 15:43:54,213 INFO    SenderThread:1848599 [sender.py:send_exit():597] send defer
+2024-04-24 15:43:54,213 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:43:54,213 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 0
+2024-04-24 15:43:54,214 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:43:54,214 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 0
+2024-04-24 15:43:54,214 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 1
+2024-04-24 15:43:54,214 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:43:54,214 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 1
+2024-04-24 15:43:54,214 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:43:54,214 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 1
+2024-04-24 15:43:54,214 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 2
+2024-04-24 15:43:54,214 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:43:54,214 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 2
+2024-04-24 15:43:54,214 INFO    HandlerThread:1848599 [system_monitor.py:finish():203] Stopping system monitor
+2024-04-24 15:43:54,214 DEBUG   SystemMonitor:1848599 [system_monitor.py:_start():172] Starting system metrics aggregation loop
+2024-04-24 15:43:54,215 DEBUG   SystemMonitor:1848599 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+2024-04-24 15:43:54,215 DEBUG   SystemMonitor:1848599 [system_monitor.py:_start():183] Publishing last batch of metrics
+2024-04-24 15:43:54,215 INFO    HandlerThread:1848599 [interfaces.py:finish():202] Joined cpu monitor
+2024-04-24 15:43:54,217 INFO    HandlerThread:1848599 [interfaces.py:finish():202] Joined disk monitor
+2024-04-24 15:43:54,810 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
+2024-04-24 15:43:54,810 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
+2024-04-24 15:43:56,812 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
+2024-04-24 15:43:57,141 INFO    HandlerThread:1848599 [interfaces.py:finish():202] Joined gpu monitor
+2024-04-24 15:43:57,142 INFO    HandlerThread:1848599 [interfaces.py:finish():202] Joined memory monitor
+2024-04-24 15:43:57,142 INFO    HandlerThread:1848599 [interfaces.py:finish():202] Joined network monitor
+2024-04-24 15:43:57,142 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: poll_exit
+2024-04-24 15:43:57,143 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
+2024-04-24 15:43:57,143 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:43:57,143 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 2
+2024-04-24 15:43:57,143 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 3
+2024-04-24 15:43:57,144 DEBUG   SenderThread:1848599 [sender.py:send():382] send: stats
+2024-04-24 15:43:57,144 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:43:57,144 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: poll_exit
+2024-04-24 15:43:57,145 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 3
+2024-04-24 15:43:57,146 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:43:57,146 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 3
+2024-04-24 15:43:57,146 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 4
+2024-04-24 15:43:57,146 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:43:57,146 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 4
+2024-04-24 15:43:57,147 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:43:57,147 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 4
+2024-04-24 15:43:57,147 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 5
+2024-04-24 15:43:57,147 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:43:57,147 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 5
+2024-04-24 15:43:57,147 DEBUG   SenderThread:1848599 [sender.py:send():382] send: summary
+2024-04-24 15:43:57,149 INFO    SenderThread:1848599 [sender.py:_save_file():1392] saving file wandb-summary.json with policy end
+2024-04-24 15:43:57,149 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:43:57,149 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 5
+2024-04-24 15:43:57,149 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 6
+2024-04-24 15:43:57,149 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:43:57,149 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 6
+2024-04-24 15:43:57,149 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:43:57,149 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 6
+2024-04-24 15:43:57,152 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: status_report
+2024-04-24 15:43:57,275 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 7
+2024-04-24 15:43:57,275 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:43:57,275 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 7
+2024-04-24 15:43:57,275 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:43:57,275 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 7
+2024-04-24 15:43:57,814 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/config.yaml
+2024-04-24 15:43:57,814 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
+2024-04-24 15:43:58,791 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 8
+2024-04-24 15:43:58,792 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:43:58,792 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 8
+2024-04-24 15:43:58,792 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:43:58,792 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 8
+2024-04-24 15:43:58,792 INFO    SenderThread:1848599 [job_builder.py:build():298] Attempting to build job artifact
+2024-04-24 15:43:58,794 INFO    SenderThread:1848599 [job_builder.py:_get_source_type():428] is repo sourced job
+2024-04-24 15:43:58,815 INFO    Thread-12 :1848599 [dir_watcher.py:_on_file_modified():288] file/dir modified: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
+2024-04-24 15:43:58,832 INFO    SenderThread:1848599 [job_builder.py:build():404] adding wandb-job metadata file
+2024-04-24 15:43:58,858 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 9
+2024-04-24 15:43:58,859 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:43:58,859 DEBUG   SenderThread:1848599 [sender.py:send():382] send: artifact
+2024-04-24 15:43:58,859 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 9
+2024-04-24 15:43:59,524 INFO    wandb-upload_0:1848599 [upload_job.py:push():89] Uploaded file /admin/home/sanchit/.local/share/wandb/artifacts/staging/tmp1vajxumh
+2024-04-24 15:43:59,530 INFO    wandb-upload_1:1848599 [upload_job.py:push():89] Uploaded file /admin/home/sanchit/.local/share/wandb/artifacts/staging/tmp824ipvc5
+2024-04-24 15:44:00,093 INFO    SenderThread:1848599 [sender.py:send_artifact():1470] sent artifact job-https___huggingface.co_sanchit-gandhi_distil-zephyr-1.5b-ssft-ultrachat_run_sft.py - {'id': 'QXJ0aWZhY3Q6ODA4NTQyNDIx', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjE2NjI0NzU4Nw==', 'latestArtifact': None}}
+2024-04-24 15:44:00,093 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:44:00,093 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 9
+2024-04-24 15:44:00,093 INFO    SenderThread:1848599 [dir_watcher.py:finish():358] shutting down directory watcher
+2024-04-24 15:44:00,213 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: keepalive
+2024-04-24 15:44:00,816 INFO    SenderThread:1848599 [dir_watcher.py:finish():388] scan: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files
+2024-04-24 15:44:00,817 INFO    SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml conda-environment.yaml
+2024-04-24 15:44:00,817 INFO    SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json wandb-summary.json
+2024-04-24 15:44:00,817 INFO    SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log output.log
+2024-04-24 15:44:00,821 INFO    SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/config.yaml config.yaml
+2024-04-24 15:44:00,824 INFO    SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/requirements.txt requirements.txt
+2024-04-24 15:44:00,826 INFO    SenderThread:1848599 [dir_watcher.py:finish():402] scan save: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-metadata.json wandb-metadata.json
+2024-04-24 15:44:00,826 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 10
+2024-04-24 15:44:00,828 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:44:00,828 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 10
+2024-04-24 15:44:00,828 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:44:00,828 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 10
+2024-04-24 15:44:00,828 INFO    SenderThread:1848599 [file_pusher.py:finish():175] shutting down file pusher
+2024-04-24 15:44:01,006 INFO    wandb-upload_0:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/conda-environment.yaml
+2024-04-24 15:44:01,059 INFO    wandb-upload_1:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/wandb-summary.json
+2024-04-24 15:44:01,161 INFO    wandb-upload_2:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/output.log
+2024-04-24 15:44:01,169 INFO    wandb-upload_3:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/config.yaml
+2024-04-24 15:44:01,184 INFO    wandb-upload_4:1848599 [upload_job.py:push():131] Uploaded file /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/files/requirements.txt
+2024-04-24 15:44:01,384 INFO    Thread-11 (_thread_body):1848599 [sender.py:transition_state():617] send defer: 11
+2024-04-24 15:44:01,385 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:44:01,385 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 11
+2024-04-24 15:44:01,385 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:44:01,385 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 11
+2024-04-24 15:44:01,385 INFO    SenderThread:1848599 [file_pusher.py:join():181] waiting for file pusher
+2024-04-24 15:44:01,385 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 12
+2024-04-24 15:44:01,385 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:44:01,385 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 12
+2024-04-24 15:44:01,385 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:44:01,385 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 12
+2024-04-24 15:44:01,386 INFO    SenderThread:1848599 [file_stream.py:finish():595] file stream finish called
+2024-04-24 15:44:01,445 INFO    SenderThread:1848599 [file_stream.py:finish():599] file stream finish is done
+2024-04-24 15:44:01,445 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 13
+2024-04-24 15:44:01,445 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:44:01,445 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 13
+2024-04-24 15:44:01,445 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:44:01,445 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 13
+2024-04-24 15:44:01,445 INFO    SenderThread:1848599 [sender.py:transition_state():617] send defer: 14
+2024-04-24 15:44:01,446 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: defer
+2024-04-24 15:44:01,446 DEBUG   SenderThread:1848599 [sender.py:send():382] send: final
+2024-04-24 15:44:01,446 INFO    HandlerThread:1848599 [handler.py:handle_request_defer():172] handle defer: 14
+2024-04-24 15:44:01,446 DEBUG   SenderThread:1848599 [sender.py:send():382] send: footer
+2024-04-24 15:44:01,446 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: defer
+2024-04-24 15:44:01,446 INFO    SenderThread:1848599 [sender.py:send_request_defer():613] handle sender defer: 14
+2024-04-24 15:44:01,447 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: poll_exit
+2024-04-24 15:44:01,447 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: poll_exit
+2024-04-24 15:44:01,447 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: server_info
+2024-04-24 15:44:01,447 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: get_summary
+2024-04-24 15:44:01,448 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: server_info
+2024-04-24 15:44:01,449 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: sampled_history
+2024-04-24 15:44:01,449 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: internal_messages
+2024-04-24 15:44:01,450 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: job_info
+2024-04-24 15:44:01,507 DEBUG   SenderThread:1848599 [sender.py:send_request():409] send_request: job_info
+2024-04-24 15:44:01,508 INFO    MainThread:1848599 [wandb_run.py:_footer_history_summary_info():3837] rendering history
+2024-04-24 15:44:01,508 INFO    MainThread:1848599 [wandb_run.py:_footer_history_summary_info():3869] rendering summary
+2024-04-24 15:44:01,508 INFO    MainThread:1848599 [wandb_run.py:_footer_sync_info():3796] logging synced files
+2024-04-24 15:44:01,508 DEBUG   HandlerThread:1848599 [handler.py:handle_request():146] handle_request: shutdown
+2024-04-24 15:44:01,508 INFO    HandlerThread:1848599 [handler.py:finish():866] shutting down handler
+2024-04-24 15:44:02,450 INFO    WriterThread:1848599 [datastore.py:close():294] close: /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb
+2024-04-24 15:44:02,508 INFO    SenderThread:1848599 [sender.py:finish():1548] shutting down sender
+2024-04-24 15:44:02,508 INFO    SenderThread:1848599 [file_pusher.py:finish():175] shutting down file pusher
+2024-04-24 15:44:02,508 INFO    SenderThread:1848599 [file_pusher.py:join():181] waiting for file pusher

wandb/run-20240424_154339-mwp0iutr/logs/debug.log ADDED Viewed

	@@ -0,0 +1,29 @@

+2024-04-24 15:43:39,459 INFO    MainThread:1840687 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
+2024-04-24 15:43:39,459 INFO    MainThread:1840687 [wandb_setup.py:_flush():76] Configure stats pid to 1840687
+2024-04-24 15:43:39,459 INFO    MainThread:1840687 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
+2024-04-24 15:43:39,459 INFO    MainThread:1840687 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/settings
+2024-04-24 15:43:39,459 INFO    MainThread:1840687 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
+2024-04-24 15:43:39,460 INFO    MainThread:1840687 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2024-04-24 15:43:39,460 INFO    MainThread:1840687 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py'}
+2024-04-24 15:43:39,460 INFO    MainThread:1840687 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/logs/debug.log
+2024-04-24 15:43:39,460 INFO    MainThread:1840687 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_154339-mwp0iutr/logs/debug-internal.log
+2024-04-24 15:43:39,460 INFO    MainThread:1840687 [wandb_init.py:init():564] calling init triggers
+2024-04-24 15:43:39,460 INFO    MainThread:1840687 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
+config: {}
+2024-04-24 15:43:39,460 INFO    MainThread:1840687 [wandb_init.py:init():614] starting backend
+2024-04-24 15:43:39,460 INFO    MainThread:1840687 [wandb_init.py:init():618] setting up manager
+2024-04-24 15:43:39,465 INFO    MainThread:1840687 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-04-24 15:43:39,470 INFO    MainThread:1840687 [wandb_init.py:init():624] backend started and connected
+2024-04-24 15:43:39,472 INFO    MainThread:1840687 [wandb_init.py:init():716] updated telemetry
+2024-04-24 15:43:39,520 INFO    MainThread:1840687 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
+2024-04-24 15:43:39,798 INFO    MainThread:1840687 [wandb_run.py:_on_init():2254] communicating current version
+2024-04-24 15:43:39,844 INFO    MainThread:1840687 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.6 is available!  To upgrade, please run:\n $ pip install wandb --upgrade"
+2024-04-24 15:43:39,844 INFO    MainThread:1840687 [wandb_init.py:init():800] starting run threads in backend
+2024-04-24 15:43:45,864 INFO    MainThread:1840687 [wandb_run.py:_console_start():2233] atexit reg
+2024-04-24 15:43:45,864 INFO    MainThread:1840687 [wandb_run.py:_redirect():2088] redirect: wrap_raw
+2024-04-24 15:43:45,864 INFO    MainThread:1840687 [wandb_run.py:_redirect():2153] Wrapping output streams.
+2024-04-24 15:43:45,864 INFO    MainThread:1840687 [wandb_run.py:_redirect():2178] Redirects installed.
+2024-04-24 15:43:45,866 INFO    MainThread:1840687 [wandb_init.py:init():841] run started, returning control to user process
+2024-04-24 15:43:45,867 INFO    MainThread:1840687 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.40.0.dev0', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 64, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 20000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr24_14-23-38_ip-26-0-162-233', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 5000, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 5000, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': 
False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}
+2024-04-24 15:44:02,589 WARNING MsgRouterThr:1840687 [router.py:message_loop():77] message_loop has been closed

wandb/run-20240424_154339-mwp0iutr/run-mwp0iutr.wandb ADDED Viewed

Binary file (26.4 kB). View file

wandb/run-20240424_164324-xfbnm7qo/files/conda-environment.yaml ADDED Viewed

	@@ -0,0 +1,300 @@

+name: venv
+channels:
+  - pytorch
+  - nvidia
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - blas=1.0=mkl
+  - brotli-python=1.0.9=py311h6a678d5_7
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.12.12=h06a4308_0
+  - certifi=2023.11.17=py311h06a4308_0
+  - cffi=1.16.0=py311h5eee18b_0
+  - cryptography=41.0.7=py311hdda0065_0
+  - cuda-cudart=12.1.105=0
+  - cuda-cupti=12.1.105=0
+  - cuda-libraries=12.1.0=0
+  - cuda-nvrtc=12.1.105=0
+  - cuda-nvtx=12.1.105=0
+  - cuda-opencl=12.3.101=0
+  - cuda-runtime=12.1.0=0
+  - ffmpeg=4.3=hf484d3e_0
+  - filelock=3.13.1=py311h06a4308_0
+  - freetype=2.12.1=h4a9f257_0
+  - giflib=5.2.1=h5eee18b_3
+  - gmp=6.2.1=h295c915_3
+  - gmpy2=2.1.2=py311hc9b5ff0_0
+  - gnutls=3.6.15=he1e5248_0
+  - intel-openmp=2023.1.0=hdb19cb5_46306
+  - jinja2=3.1.2=py311h06a4308_0
+  - jpeg=9e=h5eee18b_1
+  - lame=3.100=h7b6447c_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - lerc=3.0=h295c915_0
+  - libcublas=12.1.0.26=0
+  - libcufft=11.0.2.4=0
+  - libcufile=1.8.1.2=0
+  - libcurand=10.3.4.101=0
+  - libcusolver=11.4.4.55=0
+  - libcusparse=12.0.2.55=0
+  - libdeflate=1.17=h5eee18b_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libiconv=1.16=h7f8727e_2
+  - libidn2=2.3.4=h5eee18b_0
+  - libjpeg-turbo=2.0.0=h9bf148f_0
+  - libnpp=12.0.2.50=0
+  - libnvjitlink=12.1.105=0
+  - libnvjpeg=12.1.1.14=0
+  - libpng=1.6.39=h5eee18b_0
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libtasn1=4.19.0=h5eee18b_0
+  - libtiff=4.5.1=h6a678d5_0
+  - libunistring=0.9.10=h27cfd23_0
+  - libuuid=1.41.5=h5eee18b_0
+  - libwebp=1.3.2=h11a3e52_0
+  - libwebp-base=1.3.2=h5eee18b_0
+  - llvm-openmp=14.0.6=h9e868ea_0
+  - lz4-c=1.9.4=h6a678d5_0
+  - markupsafe=2.1.1=py311h5eee18b_0
+  - mkl=2023.1.0=h213fc3f_46344
+  - mkl-service=2.4.0=py311h5eee18b_1
+  - mkl_fft=1.3.8=py311h5eee18b_0
+  - mkl_random=1.2.4=py311hdb19cb5_0
+  - mpc=1.1.0=h10f8cd9_1
+  - mpfr=4.0.2=hb69a4c5_1
+  - mpmath=1.3.0=py311h06a4308_0
+  - ncurses=6.4=h6a678d5_0
+  - nettle=3.7.3=hbbd107a_1
+  - networkx=3.1=py311h06a4308_0
+  - numpy=1.26.2=py311h08b1b3b_0
+  - numpy-base=1.26.2=py311hf175353_0
+  - openh264=2.1.1=h4ff587b_0
+  - openjpeg=2.4.0=h3ad879b_0
+  - openssl=3.0.12=h7f8727e_0
+  - pycparser=2.21=pyhd3eb1b0_0
+  - pyopenssl=23.2.0=py311h06a4308_0
+  - pysocks=1.7.1=py311h06a4308_0
+  - python=3.11.5=h955ad1f_0
+  - pytorch-cuda=12.1=ha16c6d3_5
+  - pytorch-mutex=1.0=cuda
+  - pyyaml=6.0.1=py311h5eee18b_0
+  - readline=8.2=h5eee18b_0
+  - requests=2.31.0=py311h06a4308_0
+  - setuptools=68.2.2=py311h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - sympy=1.12=py311h06a4308_0
+  - tbb=2021.8.0=hdb19cb5_0
+  - tk=8.6.12=h1ccaba5_0
+  - wheel=0.41.2=py311h06a4308_0
+  - xz=5.4.5=h5eee18b_0
+  - yaml=0.2.5=h7b6447c_0
+  - zlib=1.2.13=h5eee18b_0
+  - zstd=1.5.5=hc292b87_0
+  - pip:
+      - absl-py==2.0.0
+      - accelerate==0.29.3
+      - aiohttp==3.9.1
+      - aiosignal==1.3.1
+      - annotated-types==0.6.0
+      - anyio==4.2.0
+      - appdirs==1.4.4
+      - argon2-cffi==23.1.0
+      - argon2-cffi-bindings==21.2.0
+      - arrow==1.3.0
+      - asttokens==2.4.1
+      - astunparse==1.6.3
+      - async-lru==2.0.4
+      - attrs==23.1.0
+      - audioread==3.0.1
+      - babel==2.14.0
+      - beautifulsoup4==4.12.3
+      - bitsandbytes==0.43.1
+      - bleach==6.1.0
+      - cachetools==5.3.2
+      - chardet==5.2.0
+      - charset-normalizer==3.3.2
+      - click==8.1.7
+      - comm==0.2.1
+      - datasets==2.18.1.dev0
+      - debugpy==1.8.1
+      - decorator==5.1.1
+      - deepspeed==0.12.2
+      - defusedxml==0.7.1
+      - dill==0.3.7
+      - docker-pycreds==0.4.0
+      - docstring-parser==0.15
+      - einops==0.7.0
+      - evaluate==0.4.0
+      - executing==2.0.1
+      - fastjsonschema==2.19.1
+      - flatbuffers==23.5.26
+      - fqdn==1.5.1
+      - frozenlist==1.4.1
+      - fsspec==2023.10.0
+      - gast==0.5.4
+      - gitdb==4.0.11
+      - gitpython==3.1.40
+      - google-auth==2.26.1
+      - google-auth-oauthlib==1.2.0
+      - google-pasta==0.2.0
+      - grpcio==1.60.0
+      - h11==0.14.0
+      - h5py==3.10.0
+      - hf-transfer==0.1.5
+      - hjson==3.1.0
+      - httpcore==1.0.2
+      - httpx==0.26.0
+      - huggingface-hub==0.22.2
+      - idna==3.6
+      - ipdb==0.13.13
+      - ipykernel==6.29.2
+      - ipython==8.21.0
+      - isoduration==20.11.0
+      - jedi==0.19.1
+      - jiwer==3.0.3
+      - joblib==1.3.2
+      - json5==0.9.14
+      - jsonpointer==2.4
+      - jsonschema==4.21.1
+      - jsonschema-specifications==2023.12.1
+      - jupyter-client==8.6.0
+      - jupyter-core==5.7.1
+      - jupyter-events==0.9.0
+      - jupyter-lsp==2.2.2
+      - jupyter-server==2.12.5
+      - jupyter-server-terminals==0.5.2
+      - jupyterlab==4.1.1
+      - jupyterlab-pygments==0.3.0
+      - jupyterlab-server==2.25.2
+      - keras==2.15.0
+      - lazy-loader==0.3
+      - libclang==16.0.6
+      - librosa==0.10.1
+      - llvmlite==0.41.1
+      - markdown==3.5.1
+      - markdown-it-py==3.0.0
+      - matplotlib-inline==0.1.6
+      - mdurl==0.1.2
+      - mistune==3.0.2
+      - ml-dtypes==0.2.0
+      - msgpack==1.0.7
+      - multidict==6.0.4
+      - multiprocess==0.70.15
+      - nbclient==0.9.0
+      - nbconvert==7.16.0
+      - nbformat==5.9.2
+      - nest-asyncio==1.6.0
+      - ninja==1.11.1.1
+      - nltk==3.8.1
+      - notebook-shim==0.2.3
+      - numba==0.58.1
+      - nvidia-cublas-cu12==12.1.3.1
+      - nvidia-cuda-cupti-cu12==12.1.105
+      - nvidia-cuda-nvrtc-cu12==12.1.105
+      - nvidia-cuda-runtime-cu12==12.1.105
+      - nvidia-cudnn-cu12==8.9.2.26
+      - nvidia-cufft-cu12==11.0.2.54
+      - nvidia-curand-cu12==10.3.2.106
+      - nvidia-cusolver-cu12==11.4.5.107
+      - nvidia-cusparse-cu12==12.1.0.106
+      - nvidia-nccl-cu12==2.20.5
+      - nvidia-nvjitlink-cu12==12.3.101
+      - nvidia-nvtx-cu12==12.1.105
+      - oauthlib==3.2.2
+      - opt-einsum==3.3.0
+      - overrides==7.7.0
+      - packaging==23.2
+      - pandas==2.1.4
+      - pandocfilters==1.5.1
+      - parso==0.8.3
+      - peft==0.7.1
+      - pexpect==4.9.0
+      - pillow==10.2.0
+      - pip==24.0
+      - platformdirs==4.1.0
+      - pooch==1.8.0
+      - prometheus-client==0.19.0
+      - prompt-toolkit==3.0.43
+      - protobuf==3.20.2
+      - psutil==5.9.7
+      - ptyprocess==0.7.0
+      - pure-eval==0.2.2
+      - py-cpuinfo==9.0.0
+      - pyarrow==14.0.2
+      - pyarrow-hotfix==0.6
+      - pyasn1==0.5.1
+      - pyasn1-modules==0.3.0
+      - pydantic==2.6.0
+      - pydantic-core==2.16.1
+      - pygments==2.17.2
+      - pynvml==11.5.0
+      - python-dateutil==2.8.2
+      - python-json-logger==2.0.7
+      - pytorch-triton==3.0.0+989adb9a29
+      - pytz==2023.3.post1
+      - pyzmq==25.1.2
+      - rapidfuzz==3.6.1
+      - referencing==0.33.0
+      - regex==2023.12.25
+      - requests-oauthlib==1.3.1
+      - responses==0.18.0
+      - rfc3339-validator==0.1.4
+      - rfc3986-validator==0.1.1
+      - rich==13.7.0
+      - rpds-py==0.17.1
+      - rsa==4.9
+      - safetensors==0.4.1
+      - scikit-learn==1.3.2
+      - scipy==1.11.4
+      - send2trash==1.8.2
+      - sentencepiece==0.1.99
+      - sentry-sdk==1.39.1
+      - setproctitle==1.3.3
+      - shtab==1.6.5
+      - six==1.16.0
+      - smmap==5.0.1
+      - sniffio==1.3.0
+      - soundfile==0.12.1
+      - soupsieve==2.5
+      - soxr==0.3.7
+      - stack-data==0.6.3
+      - tensorboard==2.15.1
+      - tensorboard-data-server==0.7.2
+      - tensorflow-cpu==2.15.0.post1
+      - tensorflow-estimator==2.15.0
+      - tensorflow-io-gcs-filesystem==0.35.0
+      - termcolor==2.4.0
+      - terminado==0.18.0
+      - threadpoolctl==3.2.0
+      - tinycss2==1.2.1
+      - tokenizers==0.15.0
+      - torch==2.4.0.dev20240323+cu121
+      - torchaudio==2.2.0.dev20240323+cu121
+      - torchvision==0.19.0.dev20240323+cu121
+      - tornado==6.4
+      - tqdm==4.66.1
+      - traitlets==5.14.1
+      - transformers==4.39.0.dev0
+      - triton==2.2.0
+      - trl==0.8.6
+      - types-python-dateutil==2.8.19.20240106
+      - typing-extensions==4.9.0
+      - tyro==0.7.0
+      - tzdata==2023.3
+      - uri-template==1.3.0
+      - urllib3==2.1.0
+      - wandb==0.16.1
+      - wcwidth==0.2.13
+      - webcolors==1.13
+      - webencodings==0.5.1
+      - websocket-client==1.7.0
+      - werkzeug==3.0.1
+      - wrapt==1.14.1
+      - xxhash==3.4.1
+      - yarl==1.9.4
+prefix: /fsx/sanchit/miniconda3/envs/venv

wandb/run-20240424_164324-xfbnm7qo/files/config.yaml ADDED Viewed

	@@ -0,0 +1,663 @@

+wandb_version: 1
+_wandb:
+  desc: null
+  value:
+    python_version: 3.11.5
+    cli_version: 0.16.1
+    framework: huggingface
+    huggingface_version: 4.40.0.dev0
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1713977004.542006
+    t:
+      1:
+      - 1
+      - 2
+      - 3
+      - 5
+      - 11
+      - 49
+      - 51
+      - 53
+      - 55
+      - 71
+      - 84
+      - 98
+      2:
+      - 1
+      - 2
+      - 3
+      - 5
+      - 11
+      - 49
+      - 51
+      - 53
+      - 55
+      - 71
+      - 84
+      - 98
+      3:
+      - 7
+      - 23
+      4: 3.11.5
+      5: 0.16.1
+      6: 4.40.0.dev0
+      8:
+      - 5
+      9:
+        1: transformers_trainer
+      13: linux-x86_64
+    m:
+    - 1: train/global_step
+      6:
+      - 3
+    - 1: train/loss
+      5: 1
+      6:
+      - 1
+    - 1: train/grad_norm
+      5: 1
+      6:
+      - 1
+    - 1: train/learning_rate
+      5: 1
+      6:
+      - 1
+    - 1: train/epoch
+      5: 1
+      6:
+      - 1
+vocab_size:
+  desc: null
+  value: 32000
+max_position_embeddings:
+  desc: null
+  value: 32768
+hidden_size:
+  desc: null
+  value: 4096
+intermediate_size:
+  desc: null
+  value: 14336
+num_hidden_layers:
+  desc: null
+  value: 6
+num_attention_heads:
+  desc: null
+  value: 32
+sliding_window:
+  desc: null
+  value: 4096
+num_key_value_heads:
+  desc: null
+  value: 8
+hidden_act:
+  desc: null
+  value: silu
+initializer_range:
+  desc: null
+  value: 0.02
+rms_norm_eps:
+  desc: null
+  value: 1.0e-05
+use_cache:
+  desc: null
+  value: false
+rope_theta:
+  desc: null
+  value: 10000.0
+attention_dropout:
+  desc: null
+  value: 0.0
+return_dict:
+  desc: null
+  value: true
+output_hidden_states:
+  desc: null
+  value: false
+output_attentions:
+  desc: null
+  value: false
+torchscript:
+  desc: null
+  value: false
+torch_dtype:
+  desc: null
+  value: bfloat16
+use_bfloat16:
+  desc: null
+  value: false
+tf_legacy_loss:
+  desc: null
+  value: false
+pruned_heads:
+  desc: null
+  value: {}
+tie_word_embeddings:
+  desc: null
+  value: false
+chunk_size_feed_forward:
+  desc: null
+  value: 0
+is_encoder_decoder:
+  desc: null
+  value: false
+is_decoder:
+  desc: null
+  value: false
+cross_attention_hidden_size:
+  desc: null
+  value: null
+add_cross_attention:
+  desc: null
+  value: false
+tie_encoder_decoder:
+  desc: null
+  value: false
+max_length:
+  desc: null
+  value: 20
+min_length:
+  desc: null
+  value: 0
+do_sample:
+  desc: null
+  value: false
+early_stopping:
+  desc: null
+  value: false
+num_beams:
+  desc: null
+  value: 1
+num_beam_groups:
+  desc: null
+  value: 1
+diversity_penalty:
+  desc: null
+  value: 0.0
+temperature:
+  desc: null
+  value: 1.0
+top_k:
+  desc: null
+  value: 50
+top_p:
+  desc: null
+  value: 1.0
+typical_p:
+  desc: null
+  value: 1.0
+repetition_penalty:
+  desc: null
+  value: 1.0
+length_penalty:
+  desc: null
+  value: 1.0
+no_repeat_ngram_size:
+  desc: null
+  value: 0
+encoder_no_repeat_ngram_size:
+  desc: null
+  value: 0
+bad_words_ids:
+  desc: null
+  value: null
+num_return_sequences:
+  desc: null
+  value: 1
+output_scores:
+  desc: null
+  value: false
+return_dict_in_generate:
+  desc: null
+  value: false
+forced_bos_token_id:
+  desc: null
+  value: null
+forced_eos_token_id:
+  desc: null
+  value: null
+remove_invalid_values:
+  desc: null
+  value: false
+exponential_decay_length_penalty:
+  desc: null
+  value: null
+suppress_tokens:
+  desc: null
+  value: null
+begin_suppress_tokens:
+  desc: null
+  value: null
+architectures:
+  desc: null
+  value:
+  - MistralForCausalLM
+finetuning_task:
+  desc: null
+  value: null
+id2label:
+  desc: null
+  value:
+    '0': LABEL_0
+    '1': LABEL_1
+label2id:
+  desc: null
+  value:
+    LABEL_0: 0
+    LABEL_1: 1
+tokenizer_class:
+  desc: null
+  value: null
+prefix:
+  desc: null
+  value: null
+bos_token_id:
+  desc: null
+  value: 1
+pad_token_id:
+  desc: null
+  value: null
+eos_token_id:
+  desc: null
+  value: 2
+sep_token_id:
+  desc: null
+  value: null
+decoder_start_token_id:
+  desc: null
+  value: null
+task_specific_params:
+  desc: null
+  value: null
+problem_type:
+  desc: null
+  value: null
+_name_or_path:
+  desc: null
+  value: sanchit-gandhi/Mistral-7B-v0.1-6-layer
+transformers_version:
+  desc: null
+  value: 4.40.0.dev0
+model_type:
+  desc: null
+  value: mistral
+output_dir:
+  desc: null
+  value: ./
+overwrite_output_dir:
+  desc: null
+  value: true
+do_train:
+  desc: null
+  value: false
+do_eval:
+  desc: null
+  value: true
+do_predict:
+  desc: null
+  value: false
+evaluation_strategy:
+  desc: null
+  value: steps
+prediction_loss_only:
+  desc: null
+  value: false
+per_device_train_batch_size:
+  desc: null
+  value: 32
+per_device_eval_batch_size:
+  desc: null
+  value: 32
+per_gpu_train_batch_size:
+  desc: null
+  value: null
+per_gpu_eval_batch_size:
+  desc: null
+  value: null
+gradient_accumulation_steps:
+  desc: null
+  value: 1
+eval_accumulation_steps:
+  desc: null
+  value: null
+eval_delay:
+  desc: null
+  value: 0
+learning_rate:
+  desc: null
+  value: 0.0001
+weight_decay:
+  desc: null
+  value: 0.0
+adam_beta1:
+  desc: null
+  value: 0.9
+adam_beta2:
+  desc: null
+  value: 0.999
+adam_epsilon:
+  desc: null
+  value: 1.0e-08
+max_grad_norm:
+  desc: null
+  value: 1.0
+num_train_epochs:
+  desc: null
+  value: 3.0
+max_steps:
+  desc: null
+  value: 20000
+lr_scheduler_type:
+  desc: null
+  value: linear
+lr_scheduler_kwargs:
+  desc: null
+  value: {}
+warmup_ratio:
+  desc: null
+  value: 0.0
+warmup_steps:
+  desc: null
+  value: 500
+log_level:
+  desc: null
+  value: info
+log_level_replica:
+  desc: null
+  value: warning
+log_on_each_node:
+  desc: null
+  value: true
+logging_dir:
+  desc: null
+  value: ./runs/Apr24_16-42-31_ip-26-0-162-233
+logging_strategy:
+  desc: null
+  value: steps
+logging_first_step:
+  desc: null
+  value: true
+logging_steps:
+  desc: null
+  value: 25
+logging_nan_inf_filter:
+  desc: null
+  value: true
+save_strategy:
+  desc: null
+  value: steps
+save_steps:
+  desc: null
+  value: 500
+save_total_limit:
+  desc: null
+  value: 5000
+save_safetensors:
+  desc: null
+  value: true
+save_on_each_node:
+  desc: null
+  value: false
+save_only_model:
+  desc: null
+  value: false
+no_cuda:
+  desc: null
+  value: false
+use_cpu:
+  desc: null
+  value: false
+use_mps_device:
+  desc: null
+  value: false
+seed:
+  desc: null
+  value: 42
+data_seed:
+  desc: null
+  value: null
+jit_mode_eval:
+  desc: null
+  value: false
+use_ipex:
+  desc: null
+  value: false
+bf16:
+  desc: null
+  value: true
+fp16:
+  desc: null
+  value: false
+fp16_opt_level:
+  desc: null
+  value: O1
+half_precision_backend:
+  desc: null
+  value: auto
+bf16_full_eval:
+  desc: null
+  value: false
+fp16_full_eval:
+  desc: null
+  value: false
+tf32:
+  desc: null
+  value: null
+local_rank:
+  desc: null
+  value: 0
+ddp_backend:
+  desc: null
+  value: null
+tpu_num_cores:
+  desc: null
+  value: null
+tpu_metrics_debug:
+  desc: null
+  value: false
+debug:
+  desc: null
+  value: []
+dataloader_drop_last:
+  desc: null
+  value: false
+eval_steps:
+  desc: null
+  value: 5000
+dataloader_num_workers:
+  desc: null
+  value: 0
+dataloader_prefetch_factor:
+  desc: null
+  value: null
+past_index:
+  desc: null
+  value: -1
+run_name:
+  desc: null
+  value: ./
+disable_tqdm:
+  desc: null
+  value: false
+remove_unused_columns:
+  desc: null
+  value: true
+label_names:
+  desc: null
+  value: null
+load_best_model_at_end:
+  desc: null
+  value: false
+metric_for_best_model:
+  desc: null
+  value: null
+greater_is_better:
+  desc: null
+  value: null
+ignore_data_skip:
+  desc: null
+  value: false
+fsdp:
+  desc: null
+  value: []
+fsdp_min_num_params:
+  desc: null
+  value: 0
+fsdp_config:
+  desc: null
+  value:
+    min_num_params: 0
+    xla: false
+    xla_fsdp_v2: false
+    xla_fsdp_grad_ckpt: false
+fsdp_transformer_layer_cls_to_wrap:
+  desc: null
+  value: null
+accelerator_config:
+  desc: null
+  value:
+    split_batches: false
+    dispatch_batches: null
+    even_batches: true
+    use_seedable_sampler: true
+    gradient_accumulation_kwargs: null
+deepspeed:
+  desc: null
+  value: null
+label_smoothing_factor:
+  desc: null
+  value: 0.0
+optim:
+  desc: null
+  value: adamw_torch
+optim_args:
+  desc: null
+  value: null
+adafactor:
+  desc: null
+  value: false
+group_by_length:
+  desc: null
+  value: false
+length_column_name:
+  desc: null
+  value: length
+report_to:
+  desc: null
+  value:
+  - tensorboard
+  - wandb
+ddp_find_unused_parameters:
+  desc: null
+  value: null
+ddp_bucket_cap_mb:
+  desc: null
+  value: null
+ddp_broadcast_buffers:
+  desc: null
+  value: null
+dataloader_pin_memory:
+  desc: null
+  value: true
+dataloader_persistent_workers:
+  desc: null
+  value: false
+skip_memory_metrics:
+  desc: null
+  value: true
+use_legacy_prediction_loop:
+  desc: null
+  value: false
+push_to_hub:
+  desc: null
+  value: true
+resume_from_checkpoint:
+  desc: null
+  value: null
+hub_model_id:
+  desc: null
+  value: null
+hub_strategy:
+  desc: null
+  value: every_save
+hub_token:
+  desc: null
+  value: <HUB_TOKEN>
+hub_private_repo:
+  desc: null
+  value: false
+hub_always_push:
+  desc: null
+  value: false
+gradient_checkpointing:
+  desc: null
+  value: true
+gradient_checkpointing_kwargs:
+  desc: null
+  value:
+    use_reentrant: false
+include_inputs_for_metrics:
+  desc: null
+  value: false
+fp16_backend:
+  desc: null
+  value: auto
+push_to_hub_model_id:
+  desc: null
+  value: null
+push_to_hub_organization:
+  desc: null
+  value: null
+push_to_hub_token:
+  desc: null
+  value: <PUSH_TO_HUB_TOKEN>
+mp_parameters:
+  desc: null
+  value: ''
+auto_find_batch_size:
+  desc: null
+  value: false
+full_determinism:
+  desc: null
+  value: false
+torchdynamo:
+  desc: null
+  value: null
+ray_scope:
+  desc: null
+  value: last
+ddp_timeout:
+  desc: null
+  value: 7200
+torch_compile:
+  desc: null
+  value: false
+torch_compile_backend:
+  desc: null
+  value: null
+torch_compile_mode:
+  desc: null
+  value: null
+dispatch_batches:
+  desc: null
+  value: null
+split_batches:
+  desc: null
+  value: null
+include_tokens_per_second:
+  desc: null
+  value: false
+include_num_input_tokens_seen:
+  desc: null
+  value: false
+neftune_noise_alpha:
+  desc: null
+  value: null
+optim_target_modules:
+  desc: null
+  value: null
+max_seq_length:
+  desc: null
+  value: 2048

wandb/run-20240424_164324-xfbnm7qo/files/output.log ADDED Viewed

	@@ -0,0 +1,522 @@

+  0%|                                                                                         | 0/20000 [00:00<?, ?it/s]/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.)
+  warnings.warn(
+  0%|                                                                              | 1/20000 [00:03<16:45:22,  3.02s/it]
+  0%|                                                                             | 25/20000 [00:49<10:30:28,  1.89s/it]
+  0%|▏                                                                            | 50/20000 [01:36<10:30:25,  1.90s/it]
+  0%|▎                                                                            | 75/20000 [02:23<10:25:09,  1.88s/it]
+  0%|▍                                                                           | 100/20000 [03:10<10:21:12,  1.87s/it]
+  1%|▍                                                                           | 125/20000 [03:57<10:17:00,  1.86s/it]
+  1%|▌                                                                           | 150/20000 [04:43<10:12:27,  1.85s/it]
+  1%|▋                                                                           | 174/20000 [05:28<10:11:10,  1.85s/it]
+  1%|▊                                                                           | 199/20000 [06:14<10:07:08,  1.84s/it]
+  1%|▊                                                                           | 225/20000 [07:02<10:09:09,  1.85s/it]
+  1%|▉                                                                           | 250/20000 [07:48<10:05:59,  1.84s/it]
+  1%|█                                                                           | 274/20000 [08:32<10:02:25,  1.83s/it]
+  1%|█▏                                                                          | 299/20000 [09:18<10:01:03,  1.83s/it]
+  2%|█▏                                                                          | 324/20000 [10:04<10:02:02,  1.84s/it]
+  2%|█▎                                                                           | 350/20000 [10:52<9:54:57,  1.82s/it]
+  2%|█▍                                                                          | 375/20000 [11:37<10:00:16,  1.84s/it]
+  2%|█▌                                                                           | 400/20000 [12:23<9:56:59,  1.83s/it]
+  2%|█▋                                                                           | 425/20000 [13:09<9:54:49,  1.82s/it]
+  2%|█▋                                                                           | 450/20000 [13:54<9:56:31,  1.83s/it]
+  2%|█▊                                                                           | 474/20000 [14:38<9:55:46,  1.83s/it]
+  2%|█▉                                                                           | 500/20000 [15:26<9:52:31,  1.82s/it][INFO|trainer.py:3304] 2024-04-24 16:58:56,780 >> Saving model checkpoint to ./checkpoint-500
+[INFO|configuration_utils.py:471] 2024-04-24 16:58:56,784 >> Configuration saved in ./checkpoint-500/config.json
+[INFO|configuration_utils.py:697] 2024-04-24 16:58:56,788 >> Configuration saved in ./checkpoint-500/generation_config.json
+{'loss': 2.0773, 'grad_norm': 4.6875, 'learning_rate': 0.0001, 'epoch': 0.12}
+[INFO|modeling_utils.py:2590] 2024-04-24 16:59:01,066 >> Model weights saved in ./checkpoint-500/model.safetensors
+[INFO|tokenization_utils_base.py:2488] 2024-04-24 16:59:01,079 >> tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2497] 2024-04-24 16:59:01,081 >> Special tokens file saved in ./checkpoint-500/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2488] 2024-04-24 16:59:11,382 >> tokenizer config file saved in ./tokenizer_config.json
+[INFO|tokenization_utils_base.py:2497] 2024-04-24 16:59:11,384 >> Special tokens file saved in ./special_tokens_map.json
+/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:144: UserWarning: Tensor arguments, excluding CPU tensors, are detected on at least two types of devices. Device state will only be saved for devices of a single device type, and the remaining devices will be ignored. Consequently, if any checkpointed functions involve randomness, this may result in incorrect gradients. (Note that if CUDA devices are among the devices detected, it will be prioritized; otherwise, the first device encountered will be selected.)
+  warnings.warn(
+  3%|██                                                                           | 524/20000 [16:24<9:52:57,  1.83s/it]

wandb/run-20240424_164324-xfbnm7qo/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,223 @@

+absl-py==2.0.0
+accelerate==0.29.3
+aiohttp==3.9.1
+aiosignal==1.3.1
+annotated-types==0.6.0
+anyio==4.2.0
+appdirs==1.4.4
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+astunparse==1.6.3
+async-lru==2.0.4
+attrs==23.1.0
+audioread==3.0.1
+babel==2.14.0
+beautifulsoup4==4.12.3
+bitsandbytes==0.43.1
+bleach==6.1.0
+brotli==1.0.9
+cachetools==5.3.2
+certifi==2023.11.17
+cffi==1.16.0
+chardet==5.2.0
+charset-normalizer==2.0.4
+click==8.1.7
+comm==0.2.1
+cryptography==41.0.7
+datasets==2.18.1.dev0
+debugpy==1.8.1
+decorator==5.1.1
+deepspeed==0.12.2
+defusedxml==0.7.1
+dill==0.3.7
+docker-pycreds==0.4.0
+docstring-parser==0.15
+einops==0.7.0
+evaluate==0.4.0
+executing==2.0.1
+fastjsonschema==2.19.1
+filelock==3.13.1
+flatbuffers==23.5.26
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2023.10.0
+gast==0.5.4
+gitdb==4.0.11
+gitpython==3.1.40
+gmpy2==2.1.2
+google-auth-oauthlib==1.2.0
+google-auth==2.26.1
+google-pasta==0.2.0
+grpcio==1.60.0
+h11==0.14.0
+h5py==3.10.0
+hf-transfer==0.1.5
+hjson==3.1.0
+httpcore==1.0.2
+httpx==0.26.0
+huggingface-hub==0.22.2
+idna==3.4
+ipdb==0.13.13
+ipykernel==6.29.2
+ipython==8.21.0
+isoduration==20.11.0
+jedi==0.19.1
+jinja2==3.1.2
+jiwer==3.0.3
+joblib==1.3.2
+json5==0.9.14
+jsonpointer==2.4
+jsonschema-specifications==2023.12.1
+jsonschema==4.21.1
+jupyter-client==8.6.0
+jupyter-core==5.7.1
+jupyter-events==0.9.0
+jupyter-lsp==2.2.2
+jupyter-server-terminals==0.5.2
+jupyter-server==2.12.5
+jupyterlab-pygments==0.3.0
+jupyterlab-server==2.25.2
+jupyterlab==4.1.1
+keras==2.15.0
+lazy-loader==0.3
+libclang==16.0.6
+librosa==0.10.1
+llvmlite==0.41.1
+markdown-it-py==3.0.0
+markdown==3.5.1
+markupsafe==2.1.1
+matplotlib-inline==0.1.6
+mdurl==0.1.2
+mistune==3.0.2
+mkl-fft==1.3.8
+mkl-random==1.2.4
+mkl-service==2.4.0
+ml-dtypes==0.2.0
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+multiprocess==0.70.15
+nbclient==0.9.0
+nbconvert==7.16.0
+nbformat==5.9.2
+nest-asyncio==1.6.0
+networkx==3.1
+ninja==1.11.1.1
+nltk==3.8.1
+notebook-shim==0.2.3
+numba==0.58.1
+numpy==1.26.2
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.3.101
+nvidia-nvtx-cu12==12.1.105
+oauthlib==3.2.2
+opt-einsum==3.3.0
+overrides==7.7.0
+packaging==23.2
+pandas==2.1.4
+pandocfilters==1.5.1
+parso==0.8.3
+peft==0.7.1
+pexpect==4.9.0
+pillow==10.2.0
+pip==24.0
+platformdirs==4.1.0
+pooch==1.8.0
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+protobuf==3.20.2
+psutil==5.9.7
+ptyprocess==0.7.0
+pure-eval==0.2.2
+py-cpuinfo==9.0.0
+pyarrow-hotfix==0.6
+pyarrow==14.0.2
+pyasn1-modules==0.3.0
+pyasn1==0.5.1
+pycparser==2.21
+pydantic-core==2.16.1
+pydantic==2.6.0
+pygments==2.17.2
+pynvml==11.5.0
+pyopenssl==23.2.0
+pysocks==1.7.1
+python-dateutil==2.8.2
+python-json-logger==2.0.7
+pytorch-triton==3.0.0+989adb9a29
+pytz==2023.3.post1
+pyyaml==6.0.1
+pyzmq==25.1.2
+rapidfuzz==3.6.1
+referencing==0.33.0
+regex==2023.12.25
+requests-oauthlib==1.3.1
+requests==2.31.0
+responses==0.18.0
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.7.0
+rpds-py==0.17.1
+rsa==4.9
+safetensors==0.4.1
+scikit-learn==1.3.2
+scipy==1.11.4
+send2trash==1.8.2
+sentencepiece==0.1.99
+sentry-sdk==1.39.1
+setproctitle==1.3.3
+setuptools==68.2.2
+shtab==1.6.5
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.0
+soundfile==0.12.1
+soupsieve==2.5
+soxr==0.3.7
+stack-data==0.6.3
+sympy==1.12
+tensorboard-data-server==0.7.2
+tensorboard==2.15.1
+tensorflow-cpu==2.15.0.post1
+tensorflow-estimator==2.15.0
+tensorflow-io-gcs-filesystem==0.35.0
+termcolor==2.4.0
+terminado==0.18.0
+threadpoolctl==3.2.0
+tinycss2==1.2.1
+tokenizers==0.15.0
+torch==2.4.0.dev20240323+cu121
+torchaudio==2.2.0.dev20240323+cu121
+torchvision==0.19.0.dev20240323+cu121
+tornado==6.4
+tqdm==4.66.1
+traitlets==5.14.1
+transformers==4.39.0.dev0
+triton==2.2.0
+trl==0.8.6
+types-python-dateutil==2.8.19.20240106
+typing-extensions==4.10.0
+tyro==0.7.0
+tzdata==2023.3
+uri-template==1.3.0
+urllib3==1.26.18
+wandb==0.16.1
+wcwidth==0.2.13
+webcolors==1.13
+webencodings==0.5.1
+websocket-client==1.7.0
+werkzeug==3.0.1
+wheel==0.41.2
+wrapt==1.14.1
+xxhash==3.4.1
+yarl==1.9.4

wandb/run-20240424_164324-xfbnm7qo/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,558 @@

+{
+    "os": "Linux-5.15.0-1048-aws-x86_64-with-glibc2.31",
+    "python": "3.11.5",
+    "heartbeatAt": "2024-04-24T16:43:25.058035",
+    "startedAt": "2024-04-24T16:43:24.523748",
+    "docker": null,
+    "cuda": null,
+    "args": [
+        "config_full.yaml"
+    ],
+    "state": "running",
+    "program": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py",
+    "codePathLocal": "run_sft.py",
+    "codePath": "run_sft.py",
+    "git": {
+        "remote": "https://huggingface.co/sanchit-gandhi/distil-zephyr-1.5b-ssft-ultrachat",
+        "commit": "cbea69c6b95c970317a1e47c3f614b55b33f8ed9"
+    },
+    "email": null,
+    "root": "/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat",
+    "host": "ip-26-0-162-233",
+    "username": "sanchit",
+    "executable": "/fsx/sanchit/miniconda3/envs/venv/bin/python",
+    "cpu_count": 96,
+    "cpu_count_logical": 96,
+    "cpu_freq": {
+        "current": 2729.8387291666663,
+        "min": 0.0,
+        "max": 0.0
+    },
+    "cpu_freq_per_core": [
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3598.161,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3584.12,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3598.175,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3598.329,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3596.81,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3598.102,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3596.611,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 3598.198,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        },
+        {
+            "current": 2650.0,
+            "min": 0.0,
+            "max": 0.0
+        }
+    ],
+    "disk": {
+        "/": {
+            "total": 290.7472343444824,
+            "used": 59.25613021850586
+        }
+    },
+    "gpu": "NVIDIA H100 80GB HBM3",
+    "gpu_count": 8,
+    "gpu_devices": [
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        },
+        {
+            "name": "NVIDIA H100 80GB HBM3",
+            "memory_total": 85520809984
+        }
+    ],
+    "memory": {
+        "total": 1999.9855270385742
+    }
+}

wandb/run-20240424_164324-xfbnm7qo/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"train/loss": 2.0215, "train/grad_norm": 4.125, "train/learning_rate": 9.987179487179488e-05, "train/epoch": 0.13, "train/global_step": 525, "_timestamp": 1713977997.0387745, "_runtime": 992.4967684745789, "_step": 21}

wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log ADDED Viewed

The diff for this file is too large to render. See raw diff

wandb/run-20240424_164324-xfbnm7qo/logs/debug.log ADDED Viewed

	@@ -0,0 +1,28 @@

+2024-04-24 16:43:24,533 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Current SDK version is 0.16.1
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Configure stats pid to 1854033
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /admin/home/sanchit/.config/wandb/settings
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/settings
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_sft.py', 'program_abspath': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py', 'program': '/fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/run_sft.py'}
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:_log_setup():524] Logging user logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug.log
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:_log_setup():525] Logging internal logs to /fsx/sanchit/distil-zephyr-1.5b-ssft-ultrachat/wandb/run-20240424_164324-xfbnm7qo/logs/debug-internal.log
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:init():564] calling init triggers
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
+config: {}
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:init():614] starting backend
+2024-04-24 16:43:24,534 INFO    MainThread:1854033 [wandb_init.py:init():618] setting up manager
+2024-04-24 16:43:24,537 INFO    MainThread:1854033 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-04-24 16:43:24,541 INFO    MainThread:1854033 [wandb_init.py:init():624] backend started and connected
+2024-04-24 16:43:24,544 INFO    MainThread:1854033 [wandb_init.py:init():716] updated telemetry
+2024-04-24 16:43:24,569 INFO    MainThread:1854033 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
+2024-04-24 16:43:24,850 INFO    MainThread:1854033 [wandb_run.py:_on_init():2254] communicating current version
+2024-04-24 16:43:24,896 INFO    MainThread:1854033 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.6 is available!  To upgrade, please run:\n $ pip install wandb --upgrade"
+2024-04-24 16:43:24,896 INFO    MainThread:1854033 [wandb_init.py:init():800] starting run threads in backend
+2024-04-24 16:43:30,532 INFO    MainThread:1854033 [wandb_run.py:_console_start():2233] atexit reg
+2024-04-24 16:43:30,532 INFO    MainThread:1854033 [wandb_run.py:_redirect():2088] redirect: wrap_raw
+2024-04-24 16:43:30,532 INFO    MainThread:1854033 [wandb_run.py:_redirect():2153] Wrapping output streams.
+2024-04-24 16:43:30,532 INFO    MainThread:1854033 [wandb_run.py:_redirect():2178] Redirects installed.
+2024-04-24 16:43:30,533 INFO    MainThread:1854033 [wandb_init.py:init():841] run started, returning control to user process
+2024-04-24 16:43:30,535 INFO    MainThread:1854033 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 6, 'num_attention_heads': 32, 'sliding_window': 4096, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'sanchit-gandhi/Mistral-7B-v0.1-6-layer', 'transformers_version': '4.40.0.dev0', 'model_type': 'mistral', 'output_dir': './', 
'overwrite_output_dir': True, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 20000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Apr24_16-42-31_ip-26-0-162-233', 'logging_strategy': 'steps', 'logging_first_step': True, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 5000, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 5000, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': 
False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 7200, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'max_seq_length': 2048}

wandb/run-20240424_164324-xfbnm7qo/run-xfbnm7qo.wandb ADDED Viewed

Binary file (297 kB). View file