laurencer committed
Commit c50cb21
1 Parent(s): 16c750e

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20240211_141255-f3ffr2e5/run-f3ffr2e5.wandb filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,214 @@
+# Created by https://www.toptal.com/developers/gitignore/api/python,macos
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos
+
+### TorchTune ###
+
+output/
+model/
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+# End of https://www.toptal.com/developers/gitignore/api/python,macos
README.md ADDED
@@ -0,0 +1,49 @@
+# torchtune research repo: token coloring (colorful llama)
+
+Playground to try out [token coloring](https://docs.google.com/document/d/1Win9vhddD-pu5P3SsG7E-dzN5oQl5DYWW1DhO7sBOgI/edit#heading=h.oqq00pt8expe) with TorchTune.
+
+The repo was generated using the alpha version of [torchtune](https://github.com/pytorch-labs/torchtune).
+
+Brief notes:
+
+- The starting recipe is based on the Alpaca Llama2 7B full finetune recipe (switched to bf16).
+- I copied a lot of functionality (like the actual model definition, dataset, etc.) directly from the torchtune repository since I needed to make changes.
+- I reduced the flexibility of the recipe in some ways (e.g. you cannot specify the model or tokenizer) and increased it in others (e.g. you can pass in a dataset path directly).
+- I added intermediate checkpointing (i.e. every `n` steps) with automatic upload of each checkpoint to HuggingFace Hub.
+- Assumes `output/` is used to store model outputs and `model/` is used to store the base model checkpoints.
+
+## Getting started
+
+The instructions below can be copy-pasted as-is onto a running instance. They assume that the `HF_TOKEN` environment variable is set with a valid token.
+
+```bash
+# for RunPod
+cd /workspace
+git clone git@github.com:pytorch-labs/torchtune.git
+cd torchtune
+pip install -e .
+
+cd /workspace
+git clone git@github.com:laurencer/torchtune-colorful-llama.git
+cd torchtune-colorful-llama
+
+# for wandb support
+pip install wandb
+```
+
+```bash
+mkdir -p model/
+tune download --repo-id meta-llama/Llama-2-7b --output-dir model/
+```
+
+```bash
+tune convert_checkpoint --checkpoint-path model/consolidated.00.pth --output-path model/llama2_native.tune
+```
+
+```bash
+mkdir -p output/
+# tune --nnodes 1 --nproc_per_node 1 ./full_finetune.py --config basic_config.yaml
+nohup tune --nnodes 1 --nproc_per_node 1 ./full_finetune.py --config basic_config.yaml > training_log_$(date "+%Y.%m.%d_%H.%M.%S").log 2>&1 &
+sleep 1
+tail -f training_log_*.log
+```
basic_config.yaml ADDED
@@ -0,0 +1,35 @@
+# Runs the full_finetune.py recipe
+#
+# To launch, run the following command from root:
+#    tune --nnodes 1 --nproc_per_node 1 --config alpaca_llama2_full_finetune --override model_checkpoint=<your_checkpoint_dir> ...
+
+# Dataset and Dataloader
+dataset: yahma/alpaca-cleaned
+seed: 42
+shuffle: True
+
+# Checkpointing
+# Removed for now given poor upload speeds for checkpoints
+# hf_repo_id: laurencer/Llama7b-Alpaca-Tune-4epochs-WithColoring
+checkpoint_every_n_steps: 5000  # 25k steps per epoch
+
+# Model Arguments
+model_checkpoint: model/llama2_native.tune
+tokenizer_checkpoint: model/tokenizer.model
+
+# Fine-tuning arguments
+batch_size: 2
+lr: 2e-5
+epochs: 4
+optimizer: SGD
+loss: CrossEntropyLoss
+output_dir: output/alpaca-llama2-finetune
+device: cuda
+dtype: fp16
+enable_fsdp: False
+enable_activation_checkpointing: True
+resume_from_checkpoint: False
+
+# Logging arguments
+metric_logger_type: wandb
+project: torchtune
custom_dataset.py ADDED
@@ -0,0 +1,179 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Tuple
+
+import torch
+
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import Dataset
+
+from datasets import load_dataset
+
+# Not ideal to import this type here but it's needed for the transform function
+from torchtune.modules import Tokenizer
+
+
+CROSS_ENTROPY_IGNORE_IDX = -100
+
+
+DEFAULT = 0
+INSTRUCTION = 1
+INPUT = 2
+RESPONSE = 3
+
+
+class ColoringAlpacaDataset(Dataset):
+    """
+    See torchtune.datasets.alpaca.AlpacaDataset for the original implementation.
+
+    Constructor now takes in a dataset path directly.
+
+    This implementation returns 3 lists representing the tokens, labels, and token colors
+    (as opposed to just the tokens & labels from the original).
+    """
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        dataset_path: str = "yahma/alpaca-cleaned",
+        train_on_input: bool = True,
+        **kwargs
+    ) -> None:
+        self._data = load_dataset(dataset_path, split="train")
+        self._tokenizer = tokenizer
+        self.train_on_input = train_on_input
+        self.num_colors = 4  # matches the above usage of DEFAULT, INSTRUCTION, INPUT, RESPONSE
+
+    def __len__(self):
+        return len(self._data)
+
+    def __getitem__(self, index: int) -> Tuple[List[int], List[int], List[int]]:
+        sample = self._data[index]
+
+        return self._transform(
+            instruction=sample["instruction"],
+            input=sample["input"],
+            output=sample["output"],
+        )
+
+    def _transform(
+        self, instruction: str, input: str, output: str
+    ) -> Tuple[List[int], List[int], List[int]]:
+        """
+        Split a sample on ``response`` tag to create input and labels.
+
+        Args:
+            instruction (str): Instruction text.
+            input (str): Input text. Can be an empty string. Determines the prompt generation template
+                used.
+            output (str): Response text.
+
+        Returns:
+            Tuple of encoded inputs, labels, token colors.
+        """
+        prompt = self._generate_prompt(instruction, input)
+
+        # First handle the prompt
+        colors = []
+        tokenized = []
+        labels = []
+        is_first = True
+        for token_type, text in prompt:
+            tokenized_part = self._tokenizer.encode(
+                text=text, add_bos=is_first, add_eos=False
+            )
+            is_first = False
+
+            tokenized += tokenized_part
+            colors += [token_type] * len(tokenized_part)
+            if not self.train_on_input:
+                labels += [CROSS_ENTROPY_IGNORE_IDX] * len(tokenized_part)
+            else:
+                labels += tokenized_part
+
+        # Now add the response tokens
+        tokenized_part = self._tokenizer.encode(
+            text=output, add_bos=False, add_eos=True
+        )
+        tokenized += tokenized_part
+        colors += [RESPONSE] * len(tokenized_part)
+        labels += tokenized_part
+
+        assert len(tokenized) == len(labels)
+        assert len(tokenized) == len(colors)
+
+        return tokenized, labels, colors
+
+    def _generate_prompt(self, instruction: str, input: str) -> List[Tuple[int, str]]:
+        """
+        Generate prompt from instruction and input.
+
+        Args:
+            instruction (str): Instruction text.
+            input (str): Input text.
+
+        Returns:
+            List of (int, templated text)
+        """
+        if input:
+            return [
+                (DEFAULT, (
+                    "Below is an instruction that describes a task, paired with an input that provides further context. "
+                    "Write a response that appropriately completes the request.\n\n"
+                    "### Instruction:\n"
+                )),
+                (INSTRUCTION, instruction),
+                (DEFAULT, "\n\n### Input:\n"),
+                (INPUT, input),
+                (DEFAULT, "\n\n### Response:\n"),
+            ]
+        else:
+            return [
+                (DEFAULT, (
+                    "Below is an instruction that describes a task. "
+                    "Write a response that appropriately completes the request.\n\n"
+                    "### Instruction:\n"
+                )),
+                (INSTRUCTION, instruction),
+                (DEFAULT, "\n\n### Response:\n"),
+            ]
+
+
+# TokenPair is a tuple of three lists: tokenized text inputs, labels, and token colors.
+TokenPair = Tuple[List[int], List[int], List[int]]
+
+
+def padded_collate(
+    batch: List[TokenPair],
+    padding_idx: int = 0,
+    ignore_idx: int = -100,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    input_ids = pad_sequence(
+        [torch.tensor(x[0]) for x in batch],
+        batch_first=True,
+        padding_value=padding_idx,
+    )
+    labels = pad_sequence(
+        [torch.tensor(x[1]) for x in batch],
+        batch_first=True,
+        padding_value=ignore_idx,
+    )
+    colors = pad_sequence(
+        [torch.tensor(x[2]) for x in batch],
+        batch_first=True,
+        padding_value=ignore_idx,
+    )
+
+    input_ids_seq_len = input_ids.shape[-1]
+    labels_seq_len = labels.shape[-1]
+    colors_seq_len = colors.shape[-1]
+
+    assert input_ids_seq_len == labels_seq_len
+    assert input_ids_seq_len == colors_seq_len
+
+    return input_ids, labels, colors
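
To see what `padded_collate` produces, here is a minimal sketch using plain PyTorch; the toy token/label/color triples below are invented for illustration and are not from the Alpaca data.

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# Two toy (tokens, labels, colors) samples of unequal length.
batch = [
    ([1, 2, 3], [1, 2, 3], [0, 1, 3]),
    ([4, 5], [4, 5], [0, 3]),
]

# The same calls padded_collate makes: pad tokens with 0, labels/colors with -100.
input_ids = pad_sequence([torch.tensor(x[0]) for x in batch], batch_first=True, padding_value=0)
labels = pad_sequence([torch.tensor(x[1]) for x in batch], batch_first=True, padding_value=-100)
colors = pad_sequence([torch.tensor(x[2]) for x in batch], batch_first=True, padding_value=-100)

print(input_ids.tolist())  # [[1, 2, 3], [4, 5, 0]]
print(labels.tolist())     # [[1, 2, 3], [4, 5, -100]]  (pad positions are ignored by the loss)
print(colors.tolist())     # [[0, 1, 3], [0, 3, -100]]
```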
custom_model.py ADDED
@@ -0,0 +1,211 @@
+from typing import Optional
+
+import torch
+
+from torch import nn, Tensor
+import copy
+
+from torchtune.modules import (
+    CausalSelfAttention,
+    FeedForward,
+    KVCache,
+    RMSNorm,
+    RotaryPositionalEmbeddings,
+    # TransformerDecoder, replaced with our custom implementation.
+    TransformerDecoderLayer,
+)
+
+from masked_apply import MaskedApply
+
+
+def _get_clones(module: nn.Module, n: int) -> nn.ModuleList:
+    """
+    Return a list of ``n`` identical layers.
+
+    Args:
+        module (nn.Module): module to be cloned
+        n (int): number of clones
+
+    Returns:
+        nn.ModuleList: list of ``n`` identical layers
+    """
+    # FIXME: copy.deepcopy() is not defined on nn.module
+    return nn.ModuleList([copy.deepcopy(module) for i in range(n)])
+
+
+class ColoringTransformerDecoder(nn.Module):
+    """
+    See torchtune.models.llama2.TransformerDecoder for the original implementation.
+    """
+
+    def __init__(
+        self,
+        tok_embeddings: nn.Embedding,
+        embedding_transform: nn.Module,
+        layer: TransformerDecoderLayer,
+        num_layers: int,
+        norm: nn.Module,
+        output: nn.Linear,
+    ) -> None:
+        super().__init__()
+        self.tok_embeddings = tok_embeddings
+        self.embedding_transform = embedding_transform
+        self.layers = _get_clones(layer, num_layers)
+        self.norm = norm
+        self.output = output
+
+    def forward(
+        self,
+        tokens: Tensor,
+        mask: Optional[Tensor] = None,
+        colors: Optional[Tensor] = None,
+        curr_pos: int = 0,
+    ) -> Tensor:
+        """
+        Args:
+            tokens (Tensor): input tensor with shape [b x s]
+            mask (Optional[Tensor]): attention mask tensor, defaults to None.
+            colors (Optional[Tensor]): per-token color ids with shape [b x s], used to
+                select the embedding transform applied to each token.
+            curr_pos (int): current position in the seq, defaults to 0.
+                Only relevant when incrementally decoding.
+
+        Returns:
+            Tensor: output tensor with shape [b x s x v]
+
+        Notation used for tensor shapes:
+            - b: batch size
+            - s: sequence length
+            - v: vocab size
+            - d: embed dim
+        """
+        # input tensor of shape [b, s]
+        bsz, seq_len = tokens.shape
+
+        # shape: [b, s, d]
+        h = self.tok_embeddings(tokens)
+
+        h = self.embedding_transform(h, colors)
+
+        # TODO: Fix the masking logic to not rely on checking kv_cache
+        if seq_len > 1 and self.layers[0].attn.kv_cache is not None:
+            mask = torch.full(
+                (1, 1, seq_len, seq_len), float("-inf"), device=tokens.device
+            )
+            mask = torch.triu(mask, diagonal=curr_pos + 1)
+
+        for layer in self.layers:
+            # shape: [b, s, d]
+            h = layer(h, mask, curr_pos)
+
+        # shape: [b, s, d]
+        h = self.norm(h)
+
+        # shape: [b, s, v]
+        output = self.output(h).float()
+        return output
+
+
+def colouring_llama2_7b(max_batch_size: Optional[int] = None) -> ColoringTransformerDecoder:
+    """Builder for creating a Llama2 model initialized w/ the default 7b parameter values.
+    From https://arxiv.org/abs/2307.09288, these default values are:
+    - vocab_size: 32,000
+    - embed_dim: 4,096
+    - num_layers: 32
+    - num_heads: 32
+    - num_kv_heads: 32
+    - max_seq_len: 4,096
+    - norm_eps: 1e-5
+
+    Args:
+        max_batch_size (Optional[int]): Maximum batch size to be passed to KVCache.
+
+    Returns:
+        A ``ColoringTransformerDecoder`` instance of the Llama2 model.
+    """
+    return colouring_llama2(
+        vocab_size=32_000,
+        num_layers=32,
+        num_heads=32,
+        num_kv_heads=32,
+        embed_dim=4096,
+        max_seq_len=4096,
+        num_colors=4,  # color for default, instruction, input, response
+        max_batch_size=max_batch_size,
+        attn_dropout=0.0,
+        norm_eps=1e-5,
+    )
+
+
+def _scale_hidden_dim_for_mlp(dim: int, multiple_of: int = 256) -> int:
+    """Scale hidden dimension for MLP to keep number of parameters and computation constant.
+
+    Args:
+        dim (int): Input dimension.
+        multiple_of (int): Round scaled dimension to nearest multiple of `multiple_of` for clean computation.
+
+    Returns:
+        Scaled hidden dimension.
+    """
+    # Scale hidden dimension by (2/3)4d for SwiGLU to keep number of
+    # parameters and computation constant
+    hidden_dim = 4 * int(2 * dim / 3)
+    # Round hidden dimension to nearest multiple of `multiple_of`
+    hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+    return hidden_dim
+
+
+def colouring_llama2(
+    vocab_size: int,
+    num_layers: int,
+    num_heads: int,
+    num_kv_heads: int,
+    embed_dim: int,
+    max_seq_len: int,
+    num_colors: int,
+    attn_dropout: float = 0.0,
+    max_batch_size: Optional[int] = None,
+    norm_eps: float = 1e-5,
+):
+    head_dim = embed_dim // num_heads
+    num_kv_heads = num_kv_heads if num_kv_heads else num_heads
+    kv_cache = (
+        KVCache(
+            max_batch_size=max_batch_size,
+            max_seq_len=max_seq_len,
+            n_kv_heads=num_heads,
+            head_dim=head_dim,
+        )
+        if max_batch_size is not None
+        else None
+    )
+    rope = RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len)
+    self_attn = CausalSelfAttention(
+        embed_dim=embed_dim,
+        num_heads=num_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim=head_dim,
+        q_proj=nn.Linear(embed_dim, num_heads * head_dim, bias=False),
+        k_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False),
+        v_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False),
+        output_proj=nn.Linear(embed_dim, embed_dim, bias=False),
+        pos_embeddings=rope,
+        kv_cache=kv_cache,
+        max_seq_len=max_seq_len,
+        attn_dropout=attn_dropout,
+    )
+    hidden_dim = _scale_hidden_dim_for_mlp(embed_dim)
+    mlp = FeedForward(dim=embed_dim, hidden_dim=hidden_dim, linear_class=nn.Linear)
+    layer = TransformerDecoderLayer(
+        attn=self_attn,
+        mlp=mlp,
+        sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps),
+        mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps),
+    )
+    tok_embeddings = nn.Embedding(vocab_size, embed_dim)
+    output_proj = nn.Linear(embed_dim, vocab_size, bias=False)
+    return ColoringTransformerDecoder(
+        tok_embeddings=tok_embeddings,
+        embedding_transform=MaskedApply([nn.Linear(embed_dim, embed_dim) for _ in range(num_colors)]),
+        layer=layer,
+        num_layers=num_layers,
+        norm=RMSNorm(embed_dim, eps=norm_eps),
+        output=output_proj,
+    )
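
As a quick sanity check on `_scale_hidden_dim_for_mlp`, the arithmetic for the 7B configuration (`embed_dim=4096`) works out as follows:

```python
dim = 4096
hidden_dim = 4 * int(2 * dim / 3)  # 4 * 2730 = 10920
multiple_of = 256
# Round up to the nearest multiple of 256.
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
print(hidden_dim)  # 11008 -- the FFN hidden size used by Llama2-7B
```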
custom_params.py ADDED
@@ -0,0 +1,106 @@
+from dataclasses import dataclass, field, fields
+from typing import List, Optional
+
+from torchtune.datasets import ALL_DATASETS
+from torchtune.models import ALL_MODELS, ALL_TOKENIZERS
+from torchtune.utils.metric_logging import ALL_METRIC_LOGGERS
+from torchtune.utils.precision import PRECISION_STR_TO_DTYPE
+
+
+@dataclass
+class ColoringFinetuneParams:
+    """Arguments for the finetune_llm recipe.
+
+    Args:
+        device (str): Device to use for training. Options are "cpu" and "cuda"
+        dtype (str): Data type to use for training.
+        seed (int): Random seed to use for training.
+        model (str): String specifying model architecture to fine-tune. See ``torchtune.models.get_model`` for options.
+        model_checkpoint (str): Local path to load model checkpoint from.
+        tokenizer (str): String specifying tokenizer to use. See ``torchtune.models.get_tokenizer`` for options.
+        tokenizer_checkpoint (str): Local path to load tokenizer checkpoint from.
+        dataset (str): String specifying dataset to use. See ``torchtune.datasets.get_dataset`` for options.
+            Currently, only predefined datasets in library are supported.
+        shuffle (bool): Whether to shuffle dataset.
+        batch_size (int): Batch size to use for training.
+        epochs (int): Number of epochs to train for.
+        optimizer (str): String specifying optimizer to use. See ``torchtune.optim.get_optimizer`` for options.
+        loss (str): String specifying loss function to use. See ``torchtune.losses.get_loss`` for options.
+        lr (float): Learning rate to use for optimizer.
+        activation_checkpointing (bool): Whether to use activation checkpointing.
+        output_dir (str): Local path to save checkpoints and logs to.
+        run_generation (int): Run eval on a prompt every ``run_generation`` steps. Set to 0 to disable.
+        max_steps_per_epoch (int): Maximum number of steps to take per epoch.
+        metric_logger_type (str): String specifying metric logger to use. See ``torchtune.utils.get_metric_logger``
+            for options.
+        project (str): Project name to use for logging. Used by ``WandBLogger``.
+        resume_from_previous_checkpoint (bool): Whether to resume fine-tuning from a previous checkpoint.
+        cpu_offload (bool): Whether to offload model to CPU.
+
+    Raises:
+        ValueError: If ``cpu_offload`` is ``True`` but ``device`` is not ``cuda`` and <= 1 GPUs.
+    """
+
+    # Model
+    model_checkpoint: str = ""
+
+    # Tokenizer
+    tokenizer_checkpoint: str = ""
+
+    hf_repo_id: Optional[str] = None
+    checkpoint_every_n_steps: Optional[int] = None
+
+    # Dataset and Sampler
+    dataset: str = ""
+    train_on_input: bool = True
+    shuffle: bool = True
+    batch_size: int = 2
+
+    # Optimizer and Scheduler
+    optimizer: str = "SGD"
+    lr: float = 2e-5
+    loss: str = "CrossEntropyLoss"
+    gradient_accumulation_steps: int = 1
+
+    # Training
+    epochs: int = 3
+    max_steps_per_epoch: Optional[int] = None
+    resume_from_checkpoint: bool = False
+    run_generation: Optional[int] = None
+
+    # Distributed
+    cpu_offload: bool = False
+    enable_fsdp: bool = True
+    enable_activation_checkpointing: bool = True
+
+    # Environment
+    device: str = "cuda"
+    dtype: str = "fp16"
+    seed: Optional[int] = None
+
+    # Logging
+    output_dir: str = "/tmp/full_finetune_output"
+    metric_logger_type: str = "disk"
+    project: Optional[str] = None
+    log_every_n_steps: Optional[int] = None
+
+    def __post_init__(self):
+        for param in fields(self):
+            if getattr(self, param.name) == "":
+                raise TypeError(f"{param.name} needs to be specified")
+
+        if self.cpu_offload and self.device != "cuda":
+            raise ValueError(
+                "Cannot offload model to CPU if device is not cuda or <= 1 GPUs."
+            )
+        if self.enable_fsdp and self.device == "cpu":
+            raise ValueError("FSDP is not supported on CPU.")
+
+        if self.metric_logger_type not in ALL_METRIC_LOGGERS:
+            raise ValueError(
+                f"Metric logger not recognized. Expected one of {ALL_METRIC_LOGGERS}, received {self.metric_logger_type}."
+            )
+        if self.dtype not in PRECISION_STR_TO_DTYPE:
+            raise ValueError(
+                f"Dtype {self.dtype} must be one of {', '.join(PRECISION_STR_TO_DTYPE.keys())} for finetuning."
+            )
full_finetune.py ADDED
@@ -0,0 +1,502 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+import sys
+
+from functools import partial
+from typing import Any, Dict, Optional, Tuple
+from warnings import warn
+
+import torch
+
+from torch import nn
+from torch.cuda.amp import GradScaler
+from torch.distributed import init_process_group
+from torch.optim import Optimizer
+from torch.utils.data import DataLoader, DistributedSampler
+from torchtune.utils import get_device
+
+from torchtune import models, modules, utils
+from torchtune.utils.constants import (
+    EPOCHS_KEY,
+    MAX_STEPS_KEY,
+    MODEL_KEY,
+    OPT_KEY,
+    SEED_KEY,
+    TOTAL_EPOCHS_KEY,
+)
+
+from tqdm import tqdm
+
+from recipes.interfaces import FTRecipeInterface
+from recipes.params import FullFinetuneParams
+
+from torchtune.models.llama2 import llama2_tokenizer
+
+from huggingface_hub import HfApi
+
+from custom_params import ColoringFinetuneParams
+from custom_model import ColoringTransformerDecoder, colouring_llama2_7b
+from custom_dataset import ColoringAlpacaDataset, padded_collate
+
+log = utils.get_logger("DEBUG")
+
+
+class ColoringFinetuneRecipe(FTRecipeInterface):
+    """
+    Full finetuning recipe for dense transformer-based LLMs such as Llama2.
+
+    This recipe supports:
+        - FSDP and activation checkpointing. This is enabled by default but can be
+          configured using the ``enable_fsdp`` and ``enable_activation_checkpointing`` flags.
+        - Mixed precision training - fp32, fp16 and bf16 are supported.
+        - Checkpointing of model weights, optimizer state and the recipe state (epoch and seed).
+        - Resuming from checkpoints saved using the ``save_checkpoint`` functionality.
+        - Logging via a configurable metric logger (e.g. disk or WandB).
+
+    Assumptions:
+        - Training is launched with the Tune CLI (recommended) which uses TorchRun under the
+          hood. Setting up the env variables is handled by TorchRun.
+        - Training happens on CUDA (CPU training is not supported)
+        - Checkpoints are saved at epoch boundaries and, optionally, every
+          ``checkpoint_every_n_steps`` steps within an epoch.
+        - Datasets are Map-style and data fits in memory (not streamed).
+    """
+
+    _model: ColoringTransformerDecoder
+
+    def __init__(self, params: ColoringFinetuneParams) -> None:
+        self._device = utils.get_device(device=params.device)
+        self._dtype = utils.get_dtype(dtype=params.dtype)
+
+        self._hf_hub = HfApi()
+        self._hf_repo_id = params.hf_repo_id
+
+        if self._hf_repo_id is not None:
+            self._hf_hub.create_repo(
+                repo_id=self._hf_repo_id,
+                repo_type="model",
+                private=True,
+                exist_ok=True,
+            )
+
+        # logging attributes
+        self._output_dir = params.output_dir
+        self._metric_logger = utils.get_metric_logger(
+            metric_logger_type=params.metric_logger_type,
+            project=params.project,
+            log_dir=params.output_dir,
+        )
+        self._log_every_n_steps = (
+            params.log_every_n_steps if params.log_every_n_steps else 1
+        )
+
+        self._checkpoint_every_n_steps = params.checkpoint_every_n_steps
+
+        # _is_rank_zero is used primarily for logging. In the future, the logger
+        # should directly take care of this
+        _, rank = utils.get_world_size_and_rank()
+        self._is_rank_zero = rank == 0
+
+        # Training params
+        self._resume_from_checkpoint = params.resume_from_checkpoint
+        self._enable_fsdp = params.enable_fsdp
+        self._gradient_accumulation_steps = params.gradient_accumulation_steps
+
+        # These are public properties which are updated by the checkpoint loader
+        # when ``resume_from_checkpoint`` is `True` or validated in tests
+        self.seed = utils.set_seed(seed=params.seed)
+        self.epochs_run = 0
+        self.total_epochs = params.epochs
+        self.max_steps_per_epoch = params.max_steps_per_epoch
+        self.total_training_steps = 0
+
+    def load_checkpoint(self, ckpt_path: str):
+        """
+        Extract the checkpoint state from file and validate.
+        """
+        ckpt_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
+        utils.validate_checkpoint(ckpt_dict, self._resume_from_checkpoint)
+        return ckpt_dict
+
+    def setup(self, params: FullFinetuneParams) -> None:
+        """
+        Sets up the recipe state correctly. This includes setting recipe attributes based
+        on the ``resume_from_checkpoint`` flag.
+        """
+        ckpt_dict = self.load_checkpoint(ckpt_path=params.model_checkpoint)
+
+        # If we're resuming from checkpoint, the recipe's state should be updated before
+        # initializing the training components. This ensures that the seed is correctly
+        # propagated to the relevant components
+        if self._resume_from_checkpoint:
+            self._update_recipe_state(ckpt_dict)
+
+        # ``_setup_model`` handles initialization and loading the state dict. This method
+        # should be called before ``_setup_optimizer`` since transforming the optimizer
+        # state dict requires the model
+        self._model = self._setup_model(
+            enable_fsdp=params.enable_fsdp,
+            enable_activation_checkpointing=params.enable_activation_checkpointing,
+            model_state_dict=ckpt_dict[MODEL_KEY],
+        )
+
+        self._tokenizer = self._setup_tokenizer(
+            tokenizer_checkpoint=params.tokenizer_checkpoint
+        )
+
+        # _setup_optimizer should take in ckpt_dict only if training is resumed from
+        # checkpoint. Transforming the opt state dict is handled by this method
+        self._optimizer = self._setup_optimizer(
+            optimizer=params.optimizer,
+            lr=params.lr,
+            opt_state_dict=ckpt_dict[OPT_KEY] if self._resume_from_checkpoint else None,
+        )
+
+        self._loss_fn = self._setup_loss(loss=params.loss)
+
+        # sampler and dataloader depend on the tokenizer and loss_fn and should be
+        # setup after both of these are initialized
+        self._sampler, self._dataloader = self._setup_data(
+            dataset=params.dataset,
+            train_on_input=params.train_on_input,
+            shuffle=params.shuffle,
+            batch_size=params.batch_size,
+        )
+
+        # training setup
+        self._autocast = utils.get_autocast(self._dtype, self._device)
+        self._grad_scaler = None
+        if self._dtype == torch.float16:
+            self._grad_scaler = utils.get_gradient_scaler(fsdp=params.enable_fsdp)
+        else:
+            self._grad_scaler = GradScaler(enabled=False)
+
+        # Finally update the recipe state which can only be correctly set after all of the
+        # other components have been initialized and updated.
+        #
+        # Number of training steps in each epoch depends on the number of batches produced
+        # by the dataloader, the max_steps_per_epoch param set by the user and the
+        # gradient_accumulation_steps param. This value is used for logging and tracking
+        # training state. The computation should happen after the dataloader has been setup
+        self._steps_per_epoch = (
+            len(self._dataloader) // self._gradient_accumulation_steps
+        )
+        if (
+            self.max_steps_per_epoch is not None
+            and self.max_steps_per_epoch < self._steps_per_epoch
+        ):
+            self._steps_per_epoch = self.max_steps_per_epoch
+        self.total_training_steps = self.epochs_run * self._steps_per_epoch
+
+    def _update_recipe_state(self, ckpt_dict: Dict[str, Any]) -> None:
+        """
+        Updates the recipe state from checkpoint.
+        """
+        # If seed, total_epoch or max_steps_per_epoch don't match,
+        # warn the user and overwrite
+        if (
+            self.seed != ckpt_dict[SEED_KEY]
+            or self.total_epochs != ckpt_dict[TOTAL_EPOCHS_KEY]
+            or self.max_steps_per_epoch != ckpt_dict[MAX_STEPS_KEY]
+        ):
+            warn(
+                message="""Configured value for seed, epochs or max_steps_per_epoch
+                does not match the value stored in checkpoint."""
+            )
+        self.seed = utils.set_seed(seed=ckpt_dict[SEED_KEY])
+        self.epochs_run = ckpt_dict[EPOCHS_KEY]
+        self.total_epochs = ckpt_dict[TOTAL_EPOCHS_KEY]
+        self.max_steps_per_epoch = ckpt_dict[MAX_STEPS_KEY]
+
+    def _setup_model(
+        self,
+        enable_fsdp: bool,
+        enable_activation_checkpointing: bool,
+        model_state_dict: Dict[str, Any],
+    ) -> nn.Module:
+        """
+        Set up the model including enabling FSDP and activation checkpointing. For this recipe,
+        ``enable_fsdp`` should always be ``True``. This is currently a configurable flag for
+        running tests on CPUs.
+        """
+        with get_device(self._device):
+            model = colouring_llama2_7b()
+
+        model = (
+            utils.wrap_fsdp(
+                model=model,
+                device=self._device,
+                dtype=self._dtype,
+                strategy="FULL_SHARD",
+                auto_wrap_policy={modules.TransformerDecoderLayer},
+            )
+            if enable_fsdp
+            else model
+        )
+        if enable_activation_checkpointing:
+            utils.set_activation_checkpointing(
+                model, auto_wrap_policy={modules.TransformerDecoderLayer}
+            )
+
+        model.load_state_dict(model_state_dict, strict=False)
+
+        if self._is_rank_zero:
+            log.info(
+                "Model is initialized. FSDP and Activation Checkpointing are enabled."
+            )
+
+        log.info("Compiling model")
+        model = torch.compile(model)
+        return model
+
+    def _setup_tokenizer(
+        self, tokenizer_checkpoint: str
+    ) -> modules.Tokenizer:
+        """
+        Unlike ``_setup_model``, this takes in the checkpoint and loads the sentencepiece
+        tokenizer model. This is related to how the tokenizer is implemented and should
+        change in a future iteration.
+        """
+        tokenizer = llama2_tokenizer(tokenizer_checkpoint)
+
+        if self._is_rank_zero:
+            log.info("Tokenizer is initialized from file.")
+        return tokenizer
+
+    def _setup_optimizer(
+        self, optimizer: str, lr: float, opt_state_dict: Optional[Dict[str, Any]] = None
+    ) -> Optimizer:
+        """
+        Set up the optimizer. This method also handles transforming the state dict
+        for FSDP.
+        """
+        optimizer = modules.get_optimizer(optimizer, self._model, lr)
+        if opt_state_dict:
+            opt_state_dict = utils.transform_opt_state_dict(
+                opt_state_dict, self._model, optimizer
+            )
+            optimizer.load_state_dict(opt_state_dict)
+
+        if self._is_rank_zero:
+            log.info("Optimizer is initialized.")
+        return optimizer
+
+    def _setup_loss(self, loss: str) -> nn.Module:
+        loss_fn = modules.get_loss(loss)
+
+        if self._is_rank_zero:
+            log.info("Loss is initialized.")
+
+        return loss_fn
+
+    def _setup_data(
+        self, dataset: str, shuffle: bool, batch_size: int, train_on_input: bool
+    ) -> Tuple[DistributedSampler, DataLoader]:
+        """
+        All data related setup happens here. Currently this recipe only supports the
+        DistributedSamplers with Map-style Datasets which fit into memory. Other samplers,
+        iterable datasets and streaming datasets are not supported.
+        """
+        world_size, rank = utils.get_world_size_and_rank()
+        ds = ColoringAlpacaDataset(
+            tokenizer=self._tokenizer,
+            dataset_path=dataset,
+            train_on_input=train_on_input,
+        )
+
+        sampler = DistributedSampler(
+            ds,
+            num_replicas=world_size,
+            rank=rank,
+            shuffle=shuffle,
+            seed=0,
+        )
+
+        dataloader = DataLoader(
+            dataset=ds,
+            batch_size=batch_size,
+            sampler=sampler,
+            collate_fn=partial(
+                padded_collate,
+                padding_idx=self._tokenizer.pad_id,
+                ignore_idx=self._loss_fn.ignore_index,  # TODO support loss without ignore_index
+            ),
+        )
+
+        if self._is_rank_zero:
+            log.info("Dataset and Sampler are initialized.")
+
+        return sampler, dataloader
+
+    def save_checkpoint(self, epoch: int) -> None:
+        """
+        Checkpoint the relevant state of a recipe.
+
+        This makes use of the `save_checkpoint` utility which is responsible for
+        writing the checkpoint dictionary to file. The contents of the dict are dictated
+        by whether training is complete or not.
+
+        If training is ongoing, optimizer state, seed and epochs_run are saved along with the
+        model weights.
+        """
+        os.makedirs(self._output_dir, exist_ok=True)
+        output_loc = f"{self._output_dir}/model_{epoch}.ckpt"
+        ckpt_dict = {MODEL_KEY: self._model}
+
+        # if training is in-progress, checkpoint the optimizer state as well
+        if epoch + 1 < self.total_epochs:
+            ckpt_dict.update(
+                {
+                    OPT_KEY: self._optimizer,
+                    SEED_KEY: self.seed,
+                    EPOCHS_KEY: self.epochs_run,
+                    TOTAL_EPOCHS_KEY: self.total_epochs,
+                    MAX_STEPS_KEY: self.max_steps_per_epoch,
+                }
+            )
+        utils.save_checkpoint(ckpt_dict, output_loc)
+
+        if self._is_rank_zero:
+            log.info(
+                f"Model checkpoint of size {os.path.getsize(output_loc) >> 20} MB saved to {output_loc}"
+            )
+
+        if self._hf_repo_id is not None:
+            log.info(f"Uploading checkpoint to HuggingFace Hub: {self._hf_repo_id}")
+            self._hf_hub.upload_folder(
+                folder_path=self._output_dir,
+                repo_id=self._hf_repo_id,
+                repo_type="model",
+                run_as_future=True,
+                commit_message=f"Checkpoint for epoch {epoch} (step {self.total_training_steps})",
+            )
+        else:
+            log.info("Skipping uploading to HuggingFace Hub (no repo id specified)")
+
+    def _should_update_weights(self, curr_step: int) -> bool:
+        """
+        Determines whether the weights should be updated on the current step or not.
+        True is returned either if we've accumulated gradients for enough steps or if this
+        is the last step in the epoch.
+        """
+        should_update_weights = (
+            curr_step + 1
+        ) % self._gradient_accumulation_steps == 0 or (
+            curr_step + 1
+        ) == self._steps_per_epoch
+        return should_update_weights
+
+    def train(self) -> None:
+        """
+        The core training loop. Supports training on subsets of the dataset using
+        ``max_steps_per_epoch``.
+        """
+        _, rank = utils.get_world_size_and_rank()
+
+        # zero out the gradients before starting training
+        self._optimizer.zero_grad()
+
+        # self.epochs_run should be non-zero when we're resuming from a checkpoint
+        for curr_epoch in range(self.epochs_run, self.total_epochs):
+            # Update the sampler to ensure data is correctly shuffled across epochs
+            # in case shuffle is True
+            self._sampler.set_epoch(curr_epoch)
+
+            for idx, batch in enumerate(
+                pbar := tqdm(self._dataloader, disable=not (rank == 0))
+            ):
+                if (
+                    self.max_steps_per_epoch is not None
+                    and (idx // self._gradient_accumulation_steps)
+                    == self.max_steps_per_epoch
+                ):
+                    break
+
+                input_ids, labels, colors = batch
+
+                input_ids = input_ids.to(self._device)
+                labels = labels.to(self._device)
+                colors = colors.to(self._device)
+
+                with self._autocast:
+                    logits = self._model(input_ids, colors=colors)
+                    # Shift so that tokens < n predict n
+                    logits = logits[..., :-1, :].contiguous()
+                    labels = labels[..., 1:].contiguous()
+                    logits = logits.transpose(1, 2)
+                    # Compute loss
+                    loss = self._loss_fn(logits, labels)
+
+                # Note: We're always logging the loss before normalizing it
+                # Check if this is the norm or not
+                pbar.set_description(f"{curr_epoch+1}|{idx+1}|Loss: {loss.item()}")
+
+                if self.total_training_steps % self._log_every_n_steps == 0:
+                    self._metric_logger.log_dict(
+                        {
+                            "loss": loss.item(),
+                            "lr": self._optimizer.param_groups[0]["lr"],
+                            "gpu_resources": torch.cuda.memory_allocated(),
+                        },
+                        step=self.total_training_steps,
+                    )
+
+                if self._checkpoint_every_n_steps is not None:
+                    if self.total_training_steps % self._checkpoint_every_n_steps == 0:
+                        self.save_checkpoint(epoch=curr_epoch)
+
+                # Does loss normalization need to happen within autocast context?
+                loss = loss / self._gradient_accumulation_steps
+                self._grad_scaler.scale(loss).backward()
+
+                if self._should_update_weights(idx):
+                    self._grad_scaler.step(self._optimizer)
+                    self._grad_scaler.update()
+                    self._optimizer.zero_grad(set_to_none=True)
+
+                    # Update the number of steps when the weights are updated
+                    self.total_training_steps += 1
+
+            self.epochs_run += 1
+            self.save_checkpoint(epoch=curr_epoch)
+
+    def cleanup(self) -> None:
+        self._metric_logger.close()
+
+
+def recipe_main() -> None:
+    """
+    Entry point for the recipe.
+
+    Configurable parameters are read in the following order:
+        - Parameters specified in ``ColoringFinetuneParams``
+        - Overwritten by Parameters specified in ``alpaca_llama2_full_finetune.yaml``
+        - Overwritten by arguments from the command-line using ``TuneArgumentParser``
+    """
+    parser = utils.TuneArgumentParser(
+        description=ColoringFinetuneParams.__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    args, _ = parser.parse_known_args()
+    args = vars(args)
+    recipe_params = ColoringFinetuneParams(**args)
+
+    # Env variables set by torch run; only need to initialize process group
+    # Disabled since this breaks for now on RunPod.
+    # init_process_group(backend="nccl")
+
+    recipe = ColoringFinetuneRecipe(params=recipe_params)
+    recipe.setup(params=recipe_params)
+    recipe.train()
+    recipe.cleanup()
+
+
+if __name__ == "__main__":
+    sys.exit(recipe_main())
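
To make the gradient accumulation boundary in `_should_update_weights` concrete, here is a small standalone sketch; the step counts are arbitrary and chosen only for illustration.

```python
gradient_accumulation_steps = 3
steps_per_epoch = 7  # in the recipe this is len(dataloader) // gradient_accumulation_steps

def should_update_weights(curr_step: int) -> bool:
    # Same condition as the recipe: enough accumulated micro-batches,
    # or the last step of the epoch.
    return (curr_step + 1) % gradient_accumulation_steps == 0 or (curr_step + 1) == steps_per_epoch

print([step for step in range(steps_per_epoch) if should_update_weights(step)])  # [2, 5, 6]
```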
masked_apply.py ADDED
@@ -0,0 +1,50 @@
+import torch
+import torch.nn as nn
+
+
+class MaskedApply(nn.Module):
+    """
+    Uses an index mask to select a subset of the input and apply a layer to it.
+
+    E.g. if mask is [[0, 1, 0]] layers[0] will be applied to the first and third element
+    and layers[1] will be applied to the second element.
+    """
+
+    def __init__(self, layers):
+        super(MaskedApply, self).__init__()
+        self.num_layers = len(layers)
+        self.layers = nn.ModuleList(layers)
+
+    def forward(self, x, mask):
+        # Ensure mask is a long tensor
+        mask = mask.long()
+
+        # Flatten x and mask for easier processing
+        batch_size, seq_length, embedding_size = x.shape
+
+        x_flat = x.view(-1, embedding_size)
+        mask_flat = mask.view(-1)
+
+        # Output placeholder
+        output_flat = torch.zeros_like(x_flat)
+
+        # Process each mask value
+        for i in range(self.num_layers):
+            # Find indices for current mask value
+            indices = torch.where(mask_flat == i)[0]
+
+            # Select relevant inputs for the current linear layer
+            selected_inputs = torch.index_select(x_flat, 0, indices)
+
+            # Apply linear layer
+            transformed = self.layers[i](selected_inputs)
+
+            # TODO: figure out why this is necessary.
+            transformed = transformed.to(x_flat.dtype)
+
+            # Place results back in the output tensor
+            output_flat.index_copy_(0, indices, transformed)
+
+        # Reshape output to original dimensions
+        output = output_flat.view(batch_size, seq_length, embedding_size)
+        return output
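
A minimal usage sketch of `MaskedApply`, assuming the module above is importable as `masked_apply`; the shapes and mask values are illustrative only.

```python
import torch
import torch.nn as nn

from masked_apply import MaskedApply

torch.manual_seed(0)
layers = [nn.Linear(4, 4) for _ in range(2)]
masked = MaskedApply(layers)

x = torch.randn(1, 3, 4)          # [batch, seq, embed]
mask = torch.tensor([[0, 1, 0]])  # color id per token

out = masked(x, mask)
# Tokens 0 and 2 went through layers[0]; token 1 went through layers[1].
assert torch.allclose(out[0, 0], layers[0](x[0, 0]))
assert torch.allclose(out[0, 1], layers[1](x[0, 1]))
```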
wandb/run-20240211_140449-81tescpe/files/config.yaml ADDED
@@ -0,0 +1,33 @@
+wandb_version: 1
+
+log_dir:
+  desc: null
+  value: output/alpaca-llama2-finetune
+_wandb:
+  desc: null
+  value:
+    python_version: 3.10.12
+    cli_version: 0.16.3
+    framework: torch
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1707660289.367696
+    t:
+      1:
+      - 1
+      - 49
+      - 51
+      - 55
+      2:
+      - 1
+      - 49
+      - 51
+      - 55
+      3:
+      - 16
+      - 23
+      4: 3.10.12
+      5: 0.16.3
+      8:
+      - 5
+      13: linux-x86_64
wandb/run-20240211_140449-81tescpe/files/requirements.txt ADDED
@@ -0,0 +1,181 @@
+aiohttp==3.9.3
+aiosignal==1.3.1
+antlr4-python3-runtime==4.9.3
+anyio==4.2.0
+appdirs==1.4.4
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+async-timeout==4.0.3
+attrs==23.2.0
+babel==2.14.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+blinker==1.4
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+comm==0.2.1
+cryptography==3.4.8
+datasets==2.15.0
+dbus-python==1.2.18
+debugpy==1.8.0
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.7
+distro==1.7.0
+docker-pycreds==0.4.0
+entrypoints==0.4
+exceptiongroup==1.2.0
+executing==2.0.1
+fastjsonschema==2.19.1
+filelock==3.13.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2023.10.0
+gitdb==4.0.11
+gitpython==3.1.41
+h11==0.14.0
+httpcore==1.0.2
+httplib2==0.20.2
+httpx==0.26.0
+huggingface-hub==0.19.4
+idna==3.6
+importlib-metadata==4.6.4
+ipykernel==6.29.0
+ipython-genutils==0.2.0
+ipython==8.21.0
+ipywidgets==8.1.1
+isoduration==20.11.0
+jedi==0.19.1
+jeepney==0.7.1
+jinja2==3.1.3
+json5==0.9.14
+jsonpointer==2.4
+jsonschema-specifications==2023.12.1
+jsonschema==4.21.1
+jupyter-archive==3.4.0
+jupyter-client==7.4.9
+jupyter-contrib-core==0.4.2
+jupyter-contrib-nbextensions==0.7.0
+jupyter-core==5.7.1
+jupyter-events==0.9.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.2
+jupyter-nbextensions-configurator==0.6.3
+jupyter-server-terminals==0.5.2
+jupyter-server==2.12.5
+jupyterlab-pygments==0.3.0
+jupyterlab-server==2.25.2
+jupyterlab-widgets==3.0.9
+jupyterlab==4.1.0
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+lxml==5.1.0
+markupsafe==2.1.5
+matplotlib-inline==0.1.6
+mistune==3.0.2
+more-itertools==8.10.0
+mpmath==1.3.0
+multidict==6.0.5
+multiprocess==0.70.15
+nbclassic==1.0.0
+nbclient==0.9.0
+nbconvert==7.14.2
+nbformat==5.9.2
+nest-asyncio==1.6.0
+networkx==3.2.1
+notebook-shim==0.2.3
+notebook==6.5.5
+numpy==1.26.3
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.19.3
+nvidia-nvjitlink-cu12==12.3.101
+nvidia-nvtx-cu12==12.1.105
+oauthlib==3.2.0
+omegaconf==2.3.0
+overrides==7.7.0
+packaging==23.2
+pandas==2.2.0
+pandocfilters==1.5.1
+parso==0.8.3
+pexpect==4.9.0
+pillow==10.2.0
+pip==24.0
+platformdirs==4.2.0
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+protobuf==4.25.2
+psutil==5.9.8
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow-hotfix==0.6
+pyarrow==15.0.0
+pycparser==2.21
+pygments==2.17.2
+pygobject==3.42.1
+pyjwt==2.3.0
+pyparsing==2.4.7
+python-apt==2.4.0+ubuntu2
+python-dateutil==2.8.2
+python-json-logger==2.0.7
+pytz==2024.1
+pyyaml==6.0.1
+pyzmq==24.0.1
+referencing==0.33.0
+requests==2.31.0
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.17.1
+secretstorage==3.3.1
+send2trash==1.8.2
+sentencepiece==0.1.99
+sentry-sdk==1.40.3
+setproctitle==1.3.3
+setuptools==69.0.3
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.0
+soupsieve==2.5
+stack-data==0.6.3
+sympy==1.12
+terminado==0.18.0
+tinycss2==1.2.1
+tomli==2.0.1
+torch==2.2.0
+torchaudio==2.2.0
+torchtune==0.0.1
+torchvision==0.17.0
+tornado==6.4
+tqdm==4.66.1
+traitlets==5.14.1
+triton==2.2.0
+types-python-dateutil==2.8.19.20240106
+typing-extensions==4.9.0
+tzdata==2023.4
+uri-template==1.3.0
+urllib3==2.2.0
+wadllib==1.3.6
+wandb==0.16.3
+wcwidth==0.2.13
+webcolors==1.13
+webencodings==0.5.1
+websocket-client==1.7.0
+wheel==0.42.0
+widgetsnbextension==4.0.9
+xxhash==3.4.1
+yarl==1.9.4
+zipp==1.0.0
wandb/run-20240211_140449-81tescpe/files/wandb-metadata.json ADDED
@@ -0,0 +1,691 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ {
+ "os": "Linux-5.4.0-169-generic-x86_64-with-glibc2.35",
+ "python": "3.10.12",
+ "heartbeatAt": "2024-02-11T14:04:50.615271",
+ "startedAt": "2024-02-11T14:04:49.324806",
+ "docker": null,
+ "cuda": null,
+ "args": [
+ "--config",
+ "basic_config.yaml"
+ ],
+ "state": "running",
+ "program": "/workspace/torchtune-coloring/./full_finetune.py",
+ "codePathLocal": "full_finetune.py",
+ "codePath": "full_finetune.py",
+ "git": {
+ "remote": "git@github.com:laurencer/torchtune-colorful-llama.git",
+ "commit": "bce1cd9d7dc857040353558881688a78f4e8691b"
+ },
+ "email": null,
+ "root": "/workspace/torchtune-coloring",
+ "host": "513e57971672",
+ "username": "root",
+ "executable": "/usr/bin/python",
+ "cpu_count": 64,
+ "cpu_count_logical": 128,
+ "cpu_freq": {
+ "current": 1755.92525,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 1500.04,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.933,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.537,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.464,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.681,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.897,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.836,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.352,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2342.743,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2959.736,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1736.638,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2270.242,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.665,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.398,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.231,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.877,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.695,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.602,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.693,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.044,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1878.102,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2083.492,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2048.864,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2013.355,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2977.601,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 3724.526,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2979.262,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2979.431,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.174,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.507,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.878,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.719,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2979.341,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 3724.914,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2981.767,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2975.319,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1963.286,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1666.585,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2111.485,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2423.18,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.225,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.833,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.229,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.076,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1766.004,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1577.367,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1581.383,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1580.484,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.674,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.863,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.172,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.716,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.782,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.927,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.965,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.912,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.62,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.714,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.079,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.777,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.872,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.831,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.093,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.111,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.556,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.672,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.554,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.614,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1500.085,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.844,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.574,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.902,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2862.547,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 3409.479,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2926.343,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2321.842,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1495.763,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.724,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.288,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.339,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.647,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1496.864,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.035,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.227,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1867.397,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.385,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.957,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.714,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2979.319,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 3725.068,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2979.209,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2975.679,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.377,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.144,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.576,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.402,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2979.745,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 3725.174,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2980.206,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2978.411,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2072.139,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2094.813,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2050.315,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 3524.044,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.289,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1501.308,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.431,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.037,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.557,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.081,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.268,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.813,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.585,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.952,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1496.882,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.68,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.807,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.723,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.047,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.625,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1496.718,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.27,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.148,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.911,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.737,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.721,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1496.39,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1500.029,
+ "min": 1500.0,
+ "max": 2800.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 100.0,
+ "used": 13.073665618896484
+ }
+ },
+ "gpu": "NVIDIA A100 80GB PCIe",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memory_total": 85899345920
+ }
+ ],
+ "memory": {
+ "total": 1007.7841453552246
+ }
+ }
wandb/run-20240211_140449-81tescpe/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"loss": 6.642009258270264, "lr": 2e-05, "gpu_resources": 28185677312, "_timestamp": 1707660723.104045, "_runtime": 433.7363488674164, "_step": 649, "_wandb": {"runtime": 433}}
wandb/run-20240211_140449-81tescpe/run-81tescpe.wandb ADDED
Binary file (499 kB).
 
wandb/run-20240211_141255-f3ffr2e5/files/config.yaml ADDED
@@ -0,0 +1,33 @@
+ wandb_version: 1
+
+ log_dir:
+ desc: null
+ value: output/alpaca-llama2-finetune
+ _wandb:
+ desc: null
+ value:
+ python_version: 3.10.12
+ cli_version: 0.16.3
+ framework: torch
+ is_jupyter_run: false
+ is_kaggle_kernel: false
+ start_time: 1707660775.784475
+ t:
+ 1:
+ - 1
+ - 49
+ - 51
+ - 55
+ 2:
+ - 1
+ - 49
+ - 51
+ - 55
+ 3:
+ - 16
+ - 23
+ 4: 3.10.12
+ 5: 0.16.3
+ 8:
+ - 5
+ 13: linux-x86_64
wandb/run-20240211_141255-f3ffr2e5/files/requirements.txt ADDED
@@ -0,0 +1,181 @@
+ aiohttp==3.9.3
+ aiosignal==1.3.1
+ antlr4-python3-runtime==4.9.3
+ anyio==4.2.0
+ appdirs==1.4.4
+ argon2-cffi-bindings==21.2.0
+ argon2-cffi==23.1.0
+ arrow==1.3.0
+ asttokens==2.4.1
+ async-lru==2.0.4
+ async-timeout==4.0.3
+ attrs==23.2.0
+ babel==2.14.0
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blinker==1.4
+ certifi==2024.2.2
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ comm==0.2.1
+ cryptography==3.4.8
+ datasets==2.15.0
+ dbus-python==1.2.18
+ debugpy==1.8.0
+ decorator==5.1.1
+ defusedxml==0.7.1
+ dill==0.3.7
+ distro==1.7.0
+ docker-pycreds==0.4.0
+ entrypoints==0.4
+ exceptiongroup==1.2.0
+ executing==2.0.1
+ fastjsonschema==2.19.1
+ filelock==3.13.1
+ fqdn==1.5.1
+ frozenlist==1.4.1
+ fsspec==2023.10.0
+ gitdb==4.0.11
+ gitpython==3.1.41
+ h11==0.14.0
+ httpcore==1.0.2
+ httplib2==0.20.2
+ httpx==0.26.0
+ huggingface-hub==0.19.4
+ idna==3.6
+ importlib-metadata==4.6.4
+ ipykernel==6.29.0
+ ipython-genutils==0.2.0
+ ipython==8.21.0
+ ipywidgets==8.1.1
+ isoduration==20.11.0
+ jedi==0.19.1
+ jeepney==0.7.1
+ jinja2==3.1.3
+ json5==0.9.14
+ jsonpointer==2.4
+ jsonschema-specifications==2023.12.1
+ jsonschema==4.21.1
+ jupyter-archive==3.4.0
+ jupyter-client==7.4.9
+ jupyter-contrib-core==0.4.2
+ jupyter-contrib-nbextensions==0.7.0
+ jupyter-core==5.7.1
+ jupyter-events==0.9.0
+ jupyter-highlight-selected-word==0.2.0
+ jupyter-lsp==2.2.2
+ jupyter-nbextensions-configurator==0.6.3
+ jupyter-server-terminals==0.5.2
+ jupyter-server==2.12.5
+ jupyterlab-pygments==0.3.0
+ jupyterlab-server==2.25.2
+ jupyterlab-widgets==3.0.9
+ jupyterlab==4.1.0
+ keyring==23.5.0
+ launchpadlib==1.10.16
+ lazr.restfulclient==0.14.4
+ lazr.uri==1.0.6
+ lxml==5.1.0
+ markupsafe==2.1.5
+ matplotlib-inline==0.1.6
+ mistune==3.0.2
+ more-itertools==8.10.0
+ mpmath==1.3.0
+ multidict==6.0.5
+ multiprocess==0.70.15
+ nbclassic==1.0.0
+ nbclient==0.9.0
+ nbconvert==7.14.2
+ nbformat==5.9.2
+ nest-asyncio==1.6.0
+ networkx==3.2.1
+ notebook-shim==0.2.3
+ notebook==6.5.5
+ numpy==1.26.3
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.19.3
+ nvidia-nvjitlink-cu12==12.3.101
+ nvidia-nvtx-cu12==12.1.105
+ oauthlib==3.2.0
+ omegaconf==2.3.0
+ overrides==7.7.0
+ packaging==23.2
+ pandas==2.2.0
+ pandocfilters==1.5.1
+ parso==0.8.3
+ pexpect==4.9.0
+ pillow==10.2.0
+ pip==24.0
+ platformdirs==4.2.0
+ prometheus-client==0.19.0
+ prompt-toolkit==3.0.43
+ protobuf==4.25.2
+ psutil==5.9.8
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow-hotfix==0.6
+ pyarrow==15.0.0
+ pycparser==2.21
+ pygments==2.17.2
+ pygobject==3.42.1
+ pyjwt==2.3.0
+ pyparsing==2.4.7
+ python-apt==2.4.0+ubuntu2
+ python-dateutil==2.8.2
+ python-json-logger==2.0.7
+ pytz==2024.1
+ pyyaml==6.0.1
+ pyzmq==24.0.1
+ referencing==0.33.0
+ requests==2.31.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rpds-py==0.17.1
+ secretstorage==3.3.1
+ send2trash==1.8.2
+ sentencepiece==0.1.99
+ sentry-sdk==1.40.3
+ setproctitle==1.3.3
+ setuptools==69.0.3
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.0
+ soupsieve==2.5
+ stack-data==0.6.3
+ sympy==1.12
+ terminado==0.18.0
+ tinycss2==1.2.1
+ tomli==2.0.1
+ torch==2.2.0
+ torchaudio==2.2.0
+ torchtune==0.0.1
+ torchvision==0.17.0
+ tornado==6.4
+ tqdm==4.66.1
+ traitlets==5.14.1
+ triton==2.2.0
+ types-python-dateutil==2.8.19.20240106
+ typing-extensions==4.9.0
+ tzdata==2023.4
+ uri-template==1.3.0
+ urllib3==2.2.0
+ wadllib==1.3.6
+ wandb==0.16.3
+ wcwidth==0.2.13
+ webcolors==1.13
+ webencodings==0.5.1
+ websocket-client==1.7.0
+ wheel==0.42.0
+ widgetsnbextension==4.0.9
+ xxhash==3.4.1
+ yarl==1.9.4
+ zipp==1.0.0
wandb/run-20240211_141255-f3ffr2e5/files/wandb-metadata.json ADDED
@@ -0,0 +1,691 @@
+ {
+ "os": "Linux-5.4.0-169-generic-x86_64-with-glibc2.35",
+ "python": "3.10.12",
+ "heartbeatAt": "2024-02-11T14:12:57.431913",
+ "startedAt": "2024-02-11T14:12:55.736045",
+ "docker": null,
+ "cuda": null,
+ "args": [
+ "--config",
+ "basic_config.yaml"
+ ],
+ "state": "running",
+ "program": "/workspace/torchtune-coloring/./full_finetune.py",
+ "codePathLocal": "full_finetune.py",
+ "codePath": "full_finetune.py",
+ "git": {
+ "remote": "git@github.com:laurencer/torchtune-colorful-llama.git",
+ "commit": "bce1cd9d7dc857040353558881688a78f4e8691b"
+ },
+ "email": null,
+ "root": "/workspace/torchtune-coloring",
+ "host": "513e57971672",
+ "username": "root",
+ "executable": "/usr/bin/python",
+ "cpu_count": 64,
+ "cpu_count_logical": 128,
+ "cpu_freq": {
+ "current": 1584.06415625,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 1490.009,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.378,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.271,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.077,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1696.135,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1810.431,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1650.597,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1668.338,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.137,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.334,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.558,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1580.833,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.972,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.117,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1796.541,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.312,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2977.727,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2979.389,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2978.317,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 3695.755,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.377,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.215,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1494.35,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.254,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.02,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.747,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.785,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.015,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1526.811,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1566.368,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1701.151,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1507.923,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.608,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.845,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.249,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.128,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.649,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.034,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.386,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.641,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.814,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.542,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.895,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.555,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.328,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.571,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.412,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.382,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.44,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1495.766,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.108,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.73,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.463,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.523,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1500.13,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.545,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.452,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.325,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.653,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.635,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.506,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.004,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.265,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.955,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1331.298,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.548,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1384.617,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.803,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2003.768,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2386.047,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1670.529,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1680.364,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1496.711,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1496.734,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.113,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.733,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2185.862,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2139.21,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2640.006,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2195.686,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2979.829,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2979.073,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 2961.456,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 3723.45,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.311,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.576,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1493.545,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.524,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.523,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.226,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.089,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.806,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.455,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.626,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1500.045,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1496.146,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.683,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.746,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.509,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.5,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.181,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.949,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.742,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.275,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.657,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.18,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.544,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.82,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.69,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.346,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.574,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.708,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1495.929,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.447,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1496.645,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1495.605,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.426,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.76,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.735,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.099,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.845,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1499.781,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.862,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.535,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.513,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.411,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1497.487,
+ "min": 1500.0,
+ "max": 2800.0
+ },
+ {
+ "current": 1498.069,
+ "min": 1500.0,
+ "max": 2800.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 100.0,
+ "used": 13.073677062988281
+ }
+ },
+ "gpu": "NVIDIA A100 80GB PCIe",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memory_total": 85899345920
+ }
+ ],
+ "memory": {
+ "total": 1007.7841453552246
+ }
+ }
wandb/run-20240211_141255-f3ffr2e5/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"loss": 5.02125883102417, "lr": 2e-05, "gpu_resources": 41688505856, "_timestamp": 1707690323.6159635, "_runtime": 29547.831488370895, "_step": 72713, "_wandb": {"runtime": 29548}}
wandb/run-20240211_141255-f3ffr2e5/run-f3ffr2e5.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f897ff6fe35e0befb48c9d12218b5443a432b2e464eca8e783351a8c84e92c8c
+ size 65078304