ThomasTheMaker committed on
Commit feba2ad · verified · 1 Parent(s): 6557434

Upload folder using huggingface_hub

Files changed (47)
  1. .env.example +2 -0
  2. .gitignore +174 -0
  3. .pre-commit-config.yaml +10 -0
  4. LICENSE +201 -0
  5. README.md +170 -0
  6. configs/examples/demo.yaml +48 -0
  7. configs/examples/pico-decoder-large.yaml +35 -0
  8. configs/examples/pico-decoder-medium.yaml +35 -0
  9. configs/examples/pico-decoder-small.yaml +35 -0
  10. configs/examples/pico-decoder-tiny.yaml +35 -0
  11. configs/pico-decoder-tiny-dolma10M-v1.yaml +78 -0
  12. configs/pico-decoder-tiny-dolma20M-v1.yaml +78 -0
  13. configs/pico-decoder-tiny-dolma5M-v1.yaml +78 -0
  14. plots/.gitignore +74 -0
  15. plots/404.html +33 -0
  16. plots/README.md +90 -0
  17. plots/code.js +550 -0
  18. plots/data.json +0 -0
  19. plots/index.html +72 -0
  20. plots/style.css +258 -0
  21. pyproject.toml +33 -0
  22. scripts/README.md +109 -0
  23. scripts/generate_data.py +198 -0
  24. scripts/train.py +30 -0
  25. setup.sh +200 -0
  26. src/checkpointing/__init__.py +23 -0
  27. src/checkpointing/evaluation.py +68 -0
  28. src/checkpointing/learning_dynamics.py +424 -0
  29. src/checkpointing/training.py +287 -0
  30. src/config/__init__.py +31 -0
  31. src/config/_constants.py +18 -0
  32. src/config/checkpointing_config.py +97 -0
  33. src/config/data_config.py +36 -0
  34. src/config/evaluation_config.py +28 -0
  35. src/config/model_config.py +33 -0
  36. src/config/monitoring_config.py +29 -0
  37. src/config/training_config.py +40 -0
  38. src/evaluation/__init__.py +103 -0
  39. src/evaluation/tasks/paloma.py +52 -0
  40. src/model/__init__.py +12 -0
  41. src/model/pico_decoder.py +911 -0
  42. src/training/trainer.py +753 -0
  43. src/training/utils/__init__.py +34 -0
  44. src/training/utils/data.py +35 -0
  45. src/training/utils/initialization.py +702 -0
  46. src/training/utils/io.py +52 -0
  47. src/training/utils/logging.py +48 -0
.env.example ADDED
@@ -0,0 +1,2 @@
+ WANDB_API_KEY=your_wandb_key
+ HF_TOKEN=your_huggingface_token
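These two variables are what the training stack reads for Hugging Face uploads and Weights & Biases logging. A minimal sketch of how a script might consume them, assuming `python-dotenv` is installed (the repo itself may load them differently, e.g. via `setup.sh`):

```python
import os

from dotenv import load_dotenv  # assumption: python-dotenv is available

load_dotenv()  # reads key=value pairs from .env into the process environment
hf_token = os.environ["HF_TOKEN"]        # used for Hugging Face checkpoint uploads
wandb_key = os.environ["WANDB_API_KEY"]  # used for Weights & Biases logging
```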
.gitignore ADDED
@@ -0,0 +1,174 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ poetry.lock
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Data
+ data/
+
+ # Checkpoint and Logging Directories
+ runs/
+ wandb/
+ # configs/
+
+ .vscode/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,10 @@
+ repos:
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     # Ruff version.
+     rev: v0.7.1
+     hooks:
+       # Run the linter.
+       - id: ruff
+         args: [ --fix, --extend-select, I ]
+       # Run the formatter.
+       - id: ruff-format
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,170 @@
+ # 🚀 **Pico Train**
+
+ Pico Train is a lightweight framework for training language models—from tiny-scale (~1M parameters) to mid-scale (~1B parameters)—with built-in rich checkpointing that captures activations, gradients, and model states, enabling detailed learning dynamics research.
+
+ Our **suite of pre-trained models** is already publicly available on our [Hugging Face organization](https://huggingface.co/pico-lm), and a dedicated companion library for advanced analysis—[**pico-analyze**](https://github.com/pico-lm/pico-analyze)—is fully released for deeper checkpoint studies.
+
+ > For a **detailed run-through**, check out the **full tutorial** on our website at [picolm.io](https://picolm.io).
+
+ ---
+
+ ## **Key Features**
+
+ 1. **Pico Decoder: LLAMA-style Transformer Architecture**
+    - RMSNorm, RoPE, multi-head self-attention with KV-cache, and SwiGLU activations
+    - Currently supports the **pico-decoder** model, with future expansions planned (pico-diffusion, pico-statespace, etc.)
+
+ 2. **Comprehensive Checkpoints**
+    - Saves model states, optimizer states, and training metadata
+    - Enriched with **activation and gradient** snapshots for interpretability
+
+ 3. **Focused Scale Range**
+    - Optimized to train models from **1M to 1B parameters**, where learning dynamics research is most viable
+
+ 4. **Clean, Pre-tokenized Data**
+    - Uses a pre-tokenized, pre-shuffled version of [Dolma](https://allenai.org/dolma) that we make available on [Hugging Face](https://huggingface.co/datasets/pico-lm/pretokenized-dolma)
+    - Facilitates training models using identical data for **consistency** and **comparability**
+
+ 5. **Research Ready**
+    - Minimal, well-documented code suitable for **forking and tailoring**
+    - Logs essential metrics (e.g. perplexity) throughout training
+    - Works seamlessly with [pico-analyze](https://github.com/pico-lm/pico-analyze) for advanced post-training interpretation
+
+ ---
+
+ ## **Training Philosophy**
+
+ All models in the Pico suite (both pre-trained and user-trained):
+
+ - Employ **identical architectures** and **optimizer settings**
+ - **Share** the same data order and tokens
+ - Automatically log **rich checkpoint data** (including activations, gradients)
+ - Facilitate **direct cross-scale comparisons**
+
+ This uniformity means you can isolate model size as the primary variable, giving you clearer insights into **how model capacity affects learning**.
+
+ ---
+
+ ## **Resources**
+
+ - **Pre-trained Models** (1M–1B parameters), publicly hosted on [Hugging Face](https://huggingface.co/pico-lm)
+ - **Pre-tokenized Datasets** for straightforward streaming-based training
+ - **Extensive Checkpoints** logging activation and gradient snapshots
+ - **Evaluation Metrics** (perplexity and more) tracked at each checkpoint
+
+ ---
+
+ ## **Core Components**
+
+ - **Pico-Decoder Model** (see the sketch at the end of this section)
+   - LLAMA-style auto-regressive transformer
+   - RMSNorm
+   - RoPE (Rotary Positional Embeddings)
+   - Multi-head attention with KV-cache
+   - SwiGLU activation
+
+   *Future plans include additional architectures like pico-diffusion and pico-statespace.*
+
+ - **Training & Checkpointing**
+   - Automatic storage of model and optimizer states
+   - Periodic hooks for saving **learning dynamics** (activations, gradients)
+   - Optional logging to Weights & Biases
+
+ - **Config-Driven Setup**
+   - Specify architecture, optimizer, dataset, and logging settings in YAML
+   - Straightforward to extend or modify
+
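To make the architecture bullets concrete, here is a minimal PyTorch sketch of two of the named building blocks, RMSNorm and SwiGLU. It is illustrative only; the actual implementations live in `src/model/pico_decoder.py`:

```python
import torch
import torch.nn as nn


class RMSNorm(nn.Module):
    """Root-mean-square normalization: rescale by 1/RMS, no mean subtraction."""

    def __init__(self, d_model: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(d_model))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return self.weight * x * rms


class SwiGLU(nn.Module):
    """SiLU-gated feed-forward: silu(W_gate x) * (W_up x), projected back down."""

    def __init__(self, d_model: int, hidden_dim: int):
        super().__init__()
        self.w_gate = nn.Linear(d_model, hidden_dim, bias=False)
        self.w_up = nn.Linear(d_model, hidden_dim, bias=False)
        self.w_down = nn.Linear(hidden_dim, d_model, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w_down(nn.functional.silu(self.w_gate(x)) * self.w_up(x))


# Dimensions match the tiny/demo configs (d_model: 96, activation_hidden_dim: 384).
x = torch.randn(2, 10, 96)                    # (batch, seq, d_model)
print(SwiGLU(96, 384)(RMSNorm(96)(x)).shape)  # torch.Size([2, 10, 96])
```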
+ ---
+
+ ## **Quick Start**
+
+ 1. **Clone the Repository**
+
+    ```bash
+    git clone https://github.com/pico-lm/pico-train
+    cd pico-train
+    ```
+
+ 2. **Configure Environment**
+
+    Create a `.env` file at the root with your Hugging Face and Weights & Biases tokens:
+    ```bash
+    export HF_TOKEN=your_huggingface_token
+    export WANDB_API_KEY=your_wandb_key
+    ```
+
+ 3. **Install Dependencies**
+
+    ```bash
+    source setup.sh
+    ```
+    This script checks your environment, installs necessary tools, and sets up a Poetry virtual environment.
+
+ 4. **Train Your Model Suite**
+
+    - Edit (or create) a config file (e.g., `configs/examples/demo.yaml`) to specify your architecture and training preferences.
+    - Then run:
+    ```bash
+    poetry run train --config_path configs/examples/demo.yaml
+    ```
+    - This launches training, automatically checkpointing states and saving learning dynamics data.
+
+ 5. **Explore Checkpoints**
+    - By default, checkpoints are stored under `runs/YOUR_RUN_NAME/checkpoints/`.
+    - Each checkpoint contains:
+      - **Model state** (PyTorch + Hugging Face formats)
+      - **Optimizer state**
+      - **Gradients and activations** for interpretability
+      - **Evaluation logs** (e.g. perplexity) and metrics
+
+ ---
+
+ ## **Repository Structure**
+
+ - **`src/model/pico_decoder.py`**
+   - Core LLAMA-style decoder implementation (attention, RMSNorm, RoPE, etc.)
+
+ - **`src/training/trainer.py`**
+   - Main training loop
+   - Manages distributed and multi-node settings
+   - Collects/logs metrics
+   - Orchestrates checkpoint saving
+
+ - **`src/checkpointing`**
+   - Logic for saving model states, gradients, activations
+   - Tools for uploading checkpoints to Hugging Face
+
+ - **`src/config`**
+   - Flexible dataclass-based config system (model and training hyperparameters, checkpointing, logging); see the sketch below
+
+ - **`configs/examples/demo.yaml`**
+   - Example config with default values for quick experimentation
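To illustrate the YAML-over-dataclass flow, here is a minimal sketch of how a config file can override dataclass defaults. Class and field names here are illustrative, not the exact definitions in `src/config`, and it assumes PyYAML:

```python
from dataclasses import dataclass

import yaml  # assumption: PyYAML is installed


@dataclass
class ModelConfig:
    d_model: int = 96
    activation_hidden_dim: int = 384


@dataclass
class TrainingConfig:
    max_steps: int = 100
    lr: float = 0.001


def load_configs(path: str) -> tuple[ModelConfig, TrainingConfig]:
    """Start from dataclass defaults, then apply any overrides found in the YAML."""
    with open(path) as f:
        overrides = yaml.safe_load(f) or {}
    model = ModelConfig(**overrides.get("model", {}))
    training = TrainingConfig(**overrides.get("training", {}))
    return model, training
```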
+
+ ---
+
+ ## **Advanced Analysis with Pico Analyze**
+
+ For deeper checkpoint analysis—comparing gradients, tracking representation shifts, measuring sparsity—use our companion repository [**pico-analyze**](https://github.com/pico-lm/pico-analyze). It automatically processes **pico-train** checkpoints and applies advanced metrics like **CKA**, **PWCCA**, **Gini**, **Hoyer**, and more to reveal **how** your models learn over time.
+
+ ---
+
+ ## **License**
+
+ Pico is open-source under the [Apache License 2.0](LICENSE).
+
+ ---
+
+ ## **Citation**
+
+ If you use **Pico** in your research, please cite:
+
+ ```bibtex
+ @software{pico2025,
+   author = {Diehl Martinez, Richard},
+   title = {Pico: A Lightweight Framework for Studying Language Model Learning Dynamics},
+   year = {2025},
+   url = {https://github.com/pico-lm}
+ }
+ ```
+
+ **Happy Training!** For more information and tutorials, visit our website at [picolm.io](https://picolm.io).
configs/examples/demo.yaml ADDED
@@ -0,0 +1,48 @@
+ # Demo config file
+ # You can follow this template to create your own config file
+ # Refer to the config files in the configs/ directory to see all the available options
+
+ data:
+   dataloader:
+     batch_size: 32
+
+ checkpointing:
+   run_name: "pico-decoder-demo-1"
+   save_every_n_steps: 50
+
+   save_to_hf: true
+   hf_checkpoint:
+     repo_id: "pico-lm/demo"
+
+   learning_dynamics:
+     batch_size: 16
+
+ model:
+   d_model: 96
+   activation_hidden_dim: 384
+
+ evaluation:
+   paloma:
+     batch_size: 32
+
+ monitoring:
+
+   save_to_wandb: true
+   wandb:
+     project: "pico-demo"
+     entity: "pico-lm"
+
+   logging:
+     log_every_n_steps: 10
+
+ training:
+   max_steps: 100
+
+   optimization:
+     lr: 0.001
+     lr_warmup_steps: 30
+
+     gradient_accumulation_steps: 2
+
+   fabric:
+     num_devices: 1
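As a quick sanity check on these values: the effective batch size per optimizer step is the product of the dataloader batch size, the gradient accumulation steps, and the device count. A back-of-the-envelope in Python, mirroring the demo config above:

```python
# Values mirror configs/examples/demo.yaml
batch_size = 32    # data.dataloader.batch_size
grad_accum = 2     # training.optimization.gradient_accumulation_steps
num_devices = 1    # training.fabric.num_devices

effective_batch = batch_size * grad_accum * num_devices
print(f"sequences per optimizer step: {effective_batch}")  # -> 64
```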
configs/examples/pico-decoder-large.yaml ADDED
@@ -0,0 +1,35 @@
+ # Demo config file
+ # You can follow this template to create your own config file
+ # Refer to the config files in the configs/ directory to see all the available options
+
+ checkpointing:
+   run_name: "pico-decoder-large-1"
+   save_to_hf: true
+   hf_checkpoint:
+     repo_id: "pico-lm/pico-decoder-large"
+
+   learning_dynamics:
+     batch_size: 128
+
+ model:
+   d_model: 1536
+   activation_hidden_dim: 6144
+
+ monitoring:
+   save_to_wandb: true
+   wandb:
+     project: "pico-decoder"
+     entity: "pico-lm"
+
+ training:
+   optimization:
+     gradient_accumulation_steps: 8
+
+   fabric:
+     num_nodes: 4
+     num_devices: 4
+
+ evaluation:
+   paloma:
+     batch_size: 16
+
configs/examples/pico-decoder-medium.yaml ADDED
@@ -0,0 +1,35 @@
+ # Demo config file
+ # You can follow this template to create your own config file
+ # Refer to the config files in the configs/ directory to see all the available options
+
+ checkpointing:
+   run_name: "pico-decoder-medium-1"
+   save_to_hf: true
+   hf_checkpoint:
+     repo_id: "pico-lm/pico-decoder-medium"
+
+   learning_dynamics:
+     batch_size: 128
+
+ model:
+   d_model: 768
+   activation_hidden_dim: 3072
+
+ monitoring:
+   save_to_wandb: true
+   wandb:
+     project: "pico-decoder"
+     entity: "pico-lm"
+
+ training:
+   optimization:
+     gradient_accumulation_steps: 8
+
+   fabric:
+     num_nodes: 4
+     num_devices: 4
+
+ evaluation:
+   paloma:
+     batch_size: 16
+
configs/examples/pico-decoder-small.yaml ADDED
@@ -0,0 +1,35 @@
+ # Demo config file
+ # You can follow this template to create your own config file
+ # Refer to the config files in the configs/ directory to see all the available options
+
+ checkpointing:
+   run_name: "pico-decoder-small-1"
+   save_to_hf: true
+   hf_checkpoint:
+     repo_id: "pico-lm/pico-decoder-small"
+
+   learning_dynamics:
+     batch_size: 128
+
+ model:
+   d_model: 384
+   activation_hidden_dim: 1536
+
+ monitoring:
+   save_to_wandb: true
+   wandb:
+     project: "pico-decoder"
+     entity: "pico-lm"
+
+ training:
+   optimization:
+     gradient_accumulation_steps: 8
+
+   fabric:
+     num_nodes: 4
+     num_devices: 4
+
+ evaluation:
+   paloma:
+     batch_size: 16
+
configs/examples/pico-decoder-tiny.yaml ADDED
@@ -0,0 +1,35 @@
+ # Demo config file
+ # You can follow this template to create your own config file
+ # Refer to the config files in the configs/ directory to see all the available options
+
+ checkpointing:
+   run_name: "pico-decoder-tiny-1"
+   save_to_hf: true
+   hf_checkpoint:
+     repo_id: "pico-lm/pico-decoder-tiny"
+
+   learning_dynamics:
+     batch_size: 256
+
+ model:
+   d_model: 96
+   activation_hidden_dim: 384
+
+ monitoring:
+   save_to_wandb: true
+   wandb:
+     project: "pico-decoder"
+     entity: "pico-lm"
+
+ training:
+   optimization:
+     gradient_accumulation_steps: 4
+
+   fabric:
+     num_nodes: 4
+     num_devices: 4
+
+ evaluation:
+   paloma:
+     batch_size: 32
+
configs/pico-decoder-tiny-dolma10M-v1.yaml ADDED
@@ -0,0 +1,78 @@
+ # High Quality Training Config - Optimized for H100 80GB Performance
+ # Fast training configuration maintaining identical model quality
+ # Optimized for H100 80GB with maximum throughput while preserving stability
+ # Updated for efficient training on Dolma 10M tokens with H100-optimized hyperparameters
+
+ checkpointing:
+   run_name: "pico-decoder-tiny-dolma10M-v1"
+   save_to_hf: true
+   hf_checkpoint:
+     repo_id: "ThomasTheMaker/pico-decoder-tiny"
+   save_every_n_steps: 2000  # Reduced checkpoint frequency for faster training
+
+   learning_dynamics:
+     batch_size: 1  # Minimal batch size for learning dynamics
+     eval_data: null  # Disable learning dynamics to save memory
+
+ model:
+   d_model: 96
+   activation_hidden_dim: 384
+   dropout: 0.15  # Increased dropout for stronger regularization
+   attention_dropout: 0.15  # Increased attention dropout
+   layer_norm_eps: 1e-5  # Tighter normalization for stability
+   weight_init_type: "truncated_normal"  # Truncated normal for stability
+   layer_norm_type: "rms_norm"  # RMSNorm for better stability
+   use_qk_norm: true  # Query-Key normalization for attention stability
+
+ monitoring:
+   save_to_wandb: false
+   wandb:
+     project: "pico-decoder-tiny"
+     entity: "boymyc"
+   logging:
+     log_every_n_steps: 100  # Reduced logging frequency for faster training
+
+ training:
+   max_steps: 100000  # Longer training for better convergence
+   optimization:
+     lr: 0.0002  # Scaled learning rate for larger batch size (4x increase)
+     lr_warmup_steps: 2000  # Reduced warmup for faster convergence
+     lr_scheduler: "cosine"  # Cosine decay over full dataset for sustained learning
+     weight_decay: 0.02  # Increased weight decay for stronger regularization
+     max_grad_norm: 0.5  # Tighter gradient clipping for stability
+     gradient_accumulation_steps: 1  # Reduced for faster training with larger batches
+     optimizer: "adamw"
+     adam_beta1: 0.9  # Standard AdamW beta1
+     adam_beta2: 0.999  # Standard AdamW beta2
+     adam_epsilon: 1e-8  # Tighter epsilon for numerical stability and convergence
+
+   fabric:
+     num_nodes: 1
+     num_devices: 1
+     precision: "bf16-mixed"  # BF16 for Tensor Core optimization
+
+ evaluation:
+   paloma:
+     batch_size: 1  # Minimal evaluation batch size
+     eval_every_n_steps: 1000  # Reduced evaluation frequency for faster training
+
+ data:
+   dataset:
+     name: "ThomasTheMaker/pretokenized-dolma-10M"  # Updated to the 10M token dataset
+   dataloader:
+     batch_size: 16  # Conservative H100 optimization - 4x larger for stable fast training
+   tokenizer:
+     name: "allenai/OLMo-7B-0724-hf"
+     vocab_size: 50304
+
+ # H100-optimized training strategy for fast, memory-safe training:
+ # 1. Conservative batch size (16) with scaled learning rate (0.0002) for stable H100 utilization
+ # 2. Reduced gradient accumulation (1 step) for faster optimization cycles
+ # 3. Shorter warmup (2000 steps) for quicker convergence with larger batches
+ # 4. Reduced evaluation frequency (1000 steps) to minimize training interruptions
+ # 5. Reduced checkpoint/logging frequency to minimize I/O overhead
+ # 6. Same model architecture and regularization for identical final performance
+ # 7. Expected 4-6x training speedup while maintaining model quality and memory safety
+ # 8. Memory usage: ~15-25GB of 80GB H100 VRAM (safe utilization avoiding OOM)
+ # 9. Maintains all stability features: RMSNorm, QK-Norm, dropout, weight decay
+ # 10. Same convergence quality with significant speedup and no memory issues
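As a reference for the `lr_scheduler: "cosine"` and warmup settings above, here is a minimal sketch of linear warmup followed by cosine decay. The trainer's actual scheduler may differ in details (e.g. a non-zero floor):

```python
import math

# Values mirror this config: lr, lr_warmup_steps, max_steps
LR, WARMUP, MAX_STEPS = 2e-4, 2000, 100_000


def lr_at(step: int) -> float:
    if step < WARMUP:
        return LR * step / WARMUP                       # linear warmup to peak LR
    progress = (step - WARMUP) / (MAX_STEPS - WARMUP)   # 0 -> 1 after warmup
    return 0.5 * LR * (1.0 + math.cos(math.pi * progress))  # cosine decay to 0


print(lr_at(1_000), lr_at(2_000), lr_at(100_000))  # 1e-4, 2e-4, ~0.0
```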
configs/pico-decoder-tiny-dolma20M-v1.yaml ADDED
@@ -0,0 +1,78 @@
+ # High Quality Training Config - Optimized for H100 80GB Performance
+ # Fast training configuration maintaining identical model quality
+ # Optimized for H100 80GB with maximum throughput while preserving stability
+ # Updated for efficient training on Dolma 20M tokens with H100-optimized hyperparameters
+
+ checkpointing:
+   run_name: "pico-decoder-tiny-dolma20M-v1"
+   save_to_hf: false
+   hf_checkpoint:
+     repo_id: "ThomasTheMaker/pico-decoder-tiny"
+   save_every_n_steps: 1000  # Reduced checkpoint frequency for faster training
+
+   learning_dynamics:
+     batch_size: 1  # Minimal batch size for learning dynamics
+     eval_data: null  # Disable learning dynamics to save memory
+
+ model:
+   d_model: 96
+   activation_hidden_dim: 384
+   dropout: 0.15  # Increased dropout for stronger regularization
+   attention_dropout: 0.15  # Increased attention dropout
+   layer_norm_eps: 1e-5  # Tighter normalization for stability
+   weight_init_type: "truncated_normal"  # Truncated normal for stability
+   layer_norm_type: "rms_norm"  # RMSNorm for better stability
+   use_qk_norm: true  # Query-Key normalization for attention stability
+
+ monitoring:
+   save_to_wandb: false
+   wandb:
+     project: "pico-decoder-tiny"
+     entity: "boymyc"
+   logging:
+     log_every_n_steps: 100  # Reduced logging frequency for faster training
+
+ training:
+   max_steps: 100000  # Longer training for better convergence
+   optimization:
+     lr: 0.0002  # Scaled learning rate for larger batch size (4x increase)
+     lr_warmup_steps: 2000  # Reduced warmup for faster convergence
+     lr_scheduler: "cosine"  # Cosine decay over full dataset for sustained learning
+     weight_decay: 0.02  # Increased weight decay for stronger regularization
+     max_grad_norm: 0.5  # Tighter gradient clipping for stability
+     gradient_accumulation_steps: 1  # Reduced for faster training with larger batches
+     optimizer: "adamw"
+     adam_beta1: 0.9  # Standard AdamW beta1
+     adam_beta2: 0.999  # Standard AdamW beta2
+     adam_epsilon: 1e-8  # Tighter epsilon for numerical stability and convergence
+
+   fabric:
+     num_nodes: 1
+     num_devices: 1
+     precision: "bf16-mixed"  # BF16 for Tensor Core optimization
+
+ evaluation:
+   paloma:
+     batch_size: 1  # Minimal evaluation batch size
+     eval_every_n_steps: 1000  # Reduced evaluation frequency for faster training
+
+ data:
+   dataset:
+     name: "ThomasTheMaker/pretokenized-dolma-20M"  # Updated to the 20M token dataset
+   dataloader:
+     batch_size: 16  # Conservative H100 optimization - 4x larger for stable fast training
+   tokenizer:
+     name: "allenai/OLMo-7B-0724-hf"
+     vocab_size: 50304
+
+ # H100-optimized training strategy for fast, memory-safe training:
+ # 1. Conservative batch size (16) with scaled learning rate (0.0002) for stable H100 utilization
+ # 2. Reduced gradient accumulation (1 step) for faster optimization cycles
+ # 3. Shorter warmup (2000 steps) for quicker convergence with larger batches
+ # 4. Reduced evaluation frequency (1000 steps) to minimize training interruptions
+ # 5. Reduced checkpoint/logging frequency to minimize I/O overhead
+ # 6. Same model architecture and regularization for identical final performance
+ # 7. Expected 4-6x training speedup while maintaining model quality and memory safety
+ # 8. Memory usage: ~15-25GB of 80GB H100 VRAM (safe utilization avoiding OOM)
+ # 9. Maintains all stability features: RMSNorm, QK-Norm, dropout, weight decay
+ # 10. Same convergence quality with significant speedup and no memory issues
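The `use_qk_norm: true` flag above refers to normalizing queries and keys before the attention dot product so logits stay bounded. A minimal self-contained sketch of the idea; one common variant applies RMSNorm with learned scales, and this L2-normalized version is illustrative only, not the exact pico-decoder code:

```python
import torch
import torch.nn.functional as F


def qk_norm_attention(q, k, v, scale: float = 1.0):
    """Attention with unit-norm queries/keys: logits are bounded in [-scale, scale]."""
    q = F.normalize(q, dim=-1)                   # unit-norm queries
    k = F.normalize(k, dim=-1)                   # unit-norm keys
    logits = (q @ k.transpose(-2, -1)) * scale   # cannot overflow, even in bf16
    return torch.softmax(logits, dim=-1) @ v


q = k = v = torch.randn(1, 4, 8, 16)  # (batch, heads, seq, head_dim)
print(qk_norm_attention(q, k, v).shape)  # torch.Size([1, 4, 8, 16])
```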
configs/pico-decoder-tiny-dolma5M-v1.yaml ADDED
@@ -0,0 +1,78 @@
+ # High Quality Training Config - Optimized for superior model performance
+ # This configuration prioritizes model quality over training speed
+ # Designed for RTX 5090 with focus on preventing overfitting and maximizing generalization
+ # Updated for scaling training on Dolma 5M tokens with stability-focused hyperparameters
+
+ checkpointing:
+   run_name: "pico-decoder-tiny-dolma5M-v1"
+   save_to_hf: true
+   hf_checkpoint:
+     repo_id: "ThomasTheMaker/pico-decoder-tiny"
+   save_every_n_steps: 500  # Frequent checkpoints for quality monitoring
+
+   learning_dynamics:
+     batch_size: 1  # Minimal batch size for learning dynamics
+     eval_data: null  # Disable learning dynamics to save memory
+
+ model:
+   d_model: 96
+   activation_hidden_dim: 384
+   dropout: 0.15  # Increased dropout for stronger regularization
+   attention_dropout: 0.15  # Increased attention dropout
+   layer_norm_eps: 1e-5  # Tighter normalization for stability
+   weight_init_type: "truncated_normal"  # Truncated normal for stability
+   layer_norm_type: "rms_norm"  # RMSNorm for better stability
+   use_qk_norm: true  # Query-Key normalization for attention stability
+
+ monitoring:
+   save_to_wandb: false
+   wandb:
+     project: "pico-decoder-tiny"
+     entity: "boymyc"
+   logging:
+     log_every_n_steps: 25  # Very frequent logging for quality monitoring
+
+ training:
+   max_steps: 100000  # Longer training for better convergence
+   optimization:
+     lr: 0.00005  # Even lower learning rate for precision training
+     lr_warmup_steps: 8000  # Extended warmup for stability
+     lr_scheduler: "cosine"  # Cosine decay over full dataset for sustained learning
+     weight_decay: 0.02  # Increased weight decay for stronger regularization
+     max_grad_norm: 0.5  # Tighter gradient clipping for stability
+     gradient_accumulation_steps: 4  # Increased for better gradient estimates
+     optimizer: "adamw"
+     adam_beta1: 0.9  # Standard AdamW beta1
+     adam_beta2: 0.999  # Standard AdamW beta2
+     adam_epsilon: 1e-8  # Tighter epsilon for numerical stability and convergence
+
+   fabric:
+     num_nodes: 1
+     num_devices: 1
+     precision: "bf16-mixed"  # BF16 for Tensor Core optimization
+
+ evaluation:
+   paloma:
+     batch_size: 1  # Minimal evaluation batch size
+     eval_every_n_steps: 250  # Very frequent evaluation for quality monitoring
+
+ data:
+   dataset:
+     name: "ThomasTheMaker/pretokenized-dolma-5M"  # Updated to the 5M token dataset
+   dataloader:
+     batch_size: 4  # Reduced for more stable training
+   tokenizer:
+     name: "allenai/OLMo-7B-0724-hf"
+     vocab_size: 50304
+
+ # Stability-focused training strategy for large-scale Dolma training:
+ # 1. Cosine learning rate schedule for sustained learning over full dataset
+ # 2. Truncated normal weight initialization to prevent extreme outliers
+ # 3. RMSNorm for better gradient stability during long training runs
+ # 4. Query-Key normalization (QK-Norm) to prevent attention logit overflow
+ # 5. AdamW epsilon 1e-8 for improved training stability and convergence
+ # 6. Extended warmup (8000 steps) for stable foundation
+ # 7. Stronger regularization (dropout 0.15, weight decay 0.02)
+ # 8. Tighter gradient clipping (0.5) for stability
+ # 9. More frequent evaluation (every 250 steps) for quality monitoring
+ # 10. Longer training (100000 steps) for full convergence on 5M tokens
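For reference, `gradient_accumulation_steps: 4` means gradients from four micro-batches are summed before a single optimizer update. A minimal runnable sketch of the pattern using toy stand-ins (the real loop lives in `src/training/trainer.py` and trains the pico-decoder model on a streaming dataloader):

```python
import torch
from torch import nn

# Toy stand-ins so the pattern runs end to end.
model = nn.Linear(8, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.02)
dataloader = [(torch.randn(4, 8), torch.randn(4, 1)) for _ in range(8)]

accum_steps = 4  # training.optimization.gradient_accumulation_steps

for step, (x, y) in enumerate(dataloader):
    loss = nn.functional.mse_loss(model(x), y) / accum_steps  # scale so the sum averages
    loss.backward()                                           # grads accumulate in .grad
    if (step + 1) % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)  # max_grad_norm: 0.5
        optimizer.step()        # one parameter update per 4 micro-batches
        optimizer.zero_grad()   # reset for the next accumulation window
```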
plots/.gitignore ADDED
@@ -0,0 +1,74 @@
+ # Logs
+ logs
+ *.log
+ npm-debug.log*
+ yarn-debug.log*
+ yarn-error.log*
+ firebase-debug.log*
+ firebase-debug.*.log*
+
+ # Firebase cache
+ .firebase/
+
+ # Firebase config
+
+ # Uncomment this if you'd like others to create their own Firebase project.
+ # For a team working on the same Firebase project(s), it is recommended to leave
+ # it commented so all members can deploy to the same project(s) in .firebaserc.
+ # .firebaserc
+
+ # Runtime data
+ pids
+ *.pid
+ *.seed
+ *.pid.lock
+
+ # Directory for instrumented libs generated by jscoverage/JSCover
+ lib-cov
+
+ # Coverage directory used by tools like istanbul
+ coverage
+
+ # nyc test coverage
+ .nyc_output
+
+ # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
+ .grunt
+
+ # Bower dependency directory (https://bower.io/)
+ bower_components
+
+ # node-waf configuration
+ .lock-wscript
+
+ # Compiled binary addons (http://nodejs.org/api/addons.html)
+ build/Release
+
+ # Dependency directories
+ node_modules/
+
+ # Optional npm cache directory
+ .npm
+
+ # Optional eslint cache
+ .eslintcache
+
+ # Optional REPL history
+ .node_repl_history
+
+ # Output of 'npm pack'
+ *.tgz
+
+ # Yarn Integrity file
+ .yarn-integrity
+
+ # dotenv environment variables file
+ .env
+
+ # dataconnect generated files
+ .dataconnect
+
+ # firebase files
+
+ .firebaserc
+ firebase.json
plots/404.html ADDED
@@ -0,0 +1,33 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+   <meta charset="utf-8">
+   <meta name="viewport" content="width=device-width, initial-scale=1">
+   <title>Page Not Found</title>
+
+   <style media="screen">
+     body { background: #ECEFF1; color: rgba(0,0,0,0.87); font-family: Roboto, Helvetica, Arial, sans-serif; margin: 0; padding: 0; }
+     #message { background: white; max-width: 360px; margin: 100px auto 16px; padding: 32px 24px 16px; border-radius: 3px; }
+     #message h3 { color: #888; font-weight: normal; font-size: 16px; margin: 16px 0 12px; }
+     #message h2 { color: #ffa100; font-weight: bold; font-size: 16px; margin: 0 0 8px; }
+     #message h1 { font-size: 22px; font-weight: 300; color: rgba(0,0,0,0.6); margin: 0 0 16px;}
+     #message p { line-height: 140%; margin: 16px 0 24px; font-size: 14px; }
+     #message a { display: block; text-align: center; background: #039be5; text-transform: uppercase; text-decoration: none; color: white; padding: 16px; border-radius: 4px; }
+     #message, #message a { box-shadow: 0 1px 3px rgba(0,0,0,0.12), 0 1px 2px rgba(0,0,0,0.24); }
+     #load { color: rgba(0,0,0,0.4); text-align: center; font-size: 13px; }
+     @media (max-width: 600px) {
+       body, #message { margin-top: 0; background: white; box-shadow: none; }
+       body { border-top: 16px solid #ffa100; }
+     }
+   </style>
+ </head>
+ <body>
+   <div id="message">
+     <h2>404</h2>
+     <h1>Page Not Found</h1>
+     <p>The specified file was not found on this website. Please check the URL for mistakes and try again.</p>
+     <h3>Why am I seeing this?</h3>
+     <p>This page was generated by the Firebase Command-Line Interface. To modify it, edit the <code>404.html</code> file in your project's configured <code>public</code> directory.</p>
+   </div>
+ </body>
+ </html>
plots/README.md ADDED
@@ -0,0 +1,90 @@
+ # 🚀 Pico Training Metrics Dashboard
+
+ A beautiful, interactive web dashboard for visualizing training progress across all your Pico model runs.
+
+ ## ✨ Features
+
+ - **📈 Training Loss Visualization**: Track loss curves over time for all runs
+ - **🎯 Learning Rate Schedules**: Monitor LR progression and warmup patterns
+ - **📊 Paloma Evaluation**: View perplexity metrics during training
+ - **🔄 Combined View**: See all metrics together for easy comparison
+ - **🎨 Interactive Charts**: Built with Chart.js for smooth interactions
+ - **📱 Responsive Design**: Works on desktop and mobile devices
+ - **⚙️ Run Comparison**: Compare different model configurations side-by-side
+
+ ## 🚀 Quick Start
+
+ 1. **Generate Data**: First, run the data generation script to parse your training logs:
+    ```bash
+    python scripts/generate_data.py
+    ```
+
+ 2. **View the Dashboard**: Open `index.html` in your web browser
+ 3. **Select Runs**: Use the dropdown to view specific runs or all runs together
+ 4. **Toggle Metrics**: Check/uncheck boxes to show/hide different metric types
+ 5. **Explore Charts**: Hover over data points for detailed information
+
+ ## 📁 Files
+
+ - `index.html` - Main dashboard interface
+ - `style.css` - Modern, responsive styling
+ - `code.js` - Interactive chart functionality
+ - `data.json` - Training metrics data (auto-generated from logs)
+
+ ## 🔧 Data Source
+
+ The dashboard automatically extracts training metrics from:
+ - Training loss at each step
+ - Learning rate progression
+ - Paloma evaluation results
+ - Model configuration parameters
+
+ ## 🔄 Updating Data
+
+ To refresh the dashboard with new training data:
+ 1. **Run new training sessions** - logs will be saved to `runs/*/logs/`
+ 2. **Generate updated data.json**:
+    ```bash
+    python scripts/generate_data.py
+    ```
+ 3. **Refresh the dashboard** - new runs will appear automatically
+
+ ## 🎨 Chart Types
+
+ 1. **Training Loss**: Line charts showing loss reduction over time
+ 2. **Learning Rate**: Logarithmic scale for LR schedule visualization
+ 3. **Evaluation**: Paloma perplexity metrics during training
+ 4. **Combined**: All metrics on one chart for easy comparison
+
+ ## 💡 Usage Tips
+
+ - **Compare Runs**: Select "All Runs" to see how different configurations perform
+ - **Zoom In**: Use the chart zoom features to focus on specific training phases
+ - **Export**: Right-click charts to save as images
+ - **Mobile**: Dashboard is fully responsive for mobile devices
+
+ ## 🎯 Key Metrics Tracked
+
+ - **Training Loss**: Primary performance indicator
+ - **Learning Rate**: Schedule adherence and warmup progress
+ - **Paloma Perplexity**: Model evaluation quality
+ - **Inf/NaN Counts**: Training stability monitoring
+ - **Model Config**: Architecture and hyperparameter details
+
+ ## 🌟 Design Features
+
+ - **Modern UI**: Clean, professional interface
+ - **Color Coding**: Distinct colors for each model run
+ - **Responsive Layout**: Adapts to different screen sizes
+ - **Interactive Elements**: Hover effects and smooth animations
+ - **Professional Typography**: Easy-to-read fonts and spacing
+
+ ## 📚 Documentation
+
+ For more details on generating the data.json file, see:
+ - `scripts/README.md` - Complete script documentation
+ - `scripts/generate_data.py` - The data generation script
+
+ ---
+
+ Built with ❤️ for the Pico Language Model training community
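For a quick sanity check of the generated `data.json` before opening the dashboard, a short Python snippet (the schema is inferred from how `plots/code.js` reads it: `runs[].run_name`, `runs[].training_metrics[].step/.loss`, and `runs[].evaluation_results[].step/.paloma`):

```python
import json

# Load the dashboard data and summarize what each run contains.
with open("plots/data.json") as f:
    data = json.load(f)

for run in data["runs"]:
    n_train = len(run.get("training_metrics", []))
    n_eval = len(run.get("evaluation_results", []))
    print(f"{run['run_name']}: {n_train} training points, {n_eval} eval points")
```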
plots/code.js ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Global variables
2
+ let trainingData = null;
3
+ let charts = {};
4
+
5
+ // Color palette for different runs
6
+ const colors = [
7
+ '#667eea', '#764ba2', '#f093fb', '#f5576c', '#4facfe', '#00f2fe',
8
+ '#43e97b', '#38f9d7', '#fa7093', '#fee140', '#a8edea', '#fed6e3'
9
+ ];
10
+
11
+ // Initialize the dashboard
12
+ document.addEventListener('DOMContentLoaded', function() {
13
+ loadData();
14
+ setupEventListeners();
15
+ });
16
+
17
+ // Load training data from JSON file
18
+ async function loadData() {
19
+ try {
20
+ const response = await fetch('data.json');
21
+ trainingData = await response.json();
22
+
23
+ // Merge continuation logs from the same model run
24
+ mergeContinuationLogs();
25
+
26
+ populateRunSelector();
27
+ createCharts();
28
+ updateRunSummary();
29
+ updateConfigDetails();
30
+
31
+ console.log('Data loaded and merged successfully:', trainingData);
32
+ } catch (error) {
33
+ console.error('Error loading data:', error);
34
+ document.body.innerHTML = '<div class="loading">Error loading training data. Please check the console for details.</div>';
35
+ }
36
+ }
37
+
38
+ // Merge continuation logs from the same model run
39
+ function mergeContinuationLogs() {
40
+ const runGroups = {};
41
+
42
+ // Group runs by base model name
43
+ trainingData.runs.forEach(run => {
44
+ const baseName = run.run_name;
45
+ if (!runGroups[baseName]) {
46
+ runGroups[baseName] = [];
47
+ }
48
+ runGroups[baseName].push(run);
49
+ });
50
+
51
+ // Merge runs with the same base name
52
+ const mergedRuns = [];
53
+
54
+ Object.entries(runGroups).forEach(([baseName, runs]) => {
55
+ if (runs.length === 1) {
56
+ // Single run, no merging needed
57
+ mergedRuns.push(runs[0]);
58
+ } else {
59
+ // Multiple runs to merge
60
+ console.log(`Merging ${runs.length} continuation logs for ${baseName}`);
61
+
62
+ const mergedRun = {
63
+ run_name: baseName,
64
+ log_files: runs.map(r => r.log_file),
65
+ training_metrics: [],
66
+ evaluation_results: [],
67
+ config: runs[0].config || {}
68
+ };
69
+
70
+ // Merge training metrics (they should be continuous)
71
+ runs.forEach(run => {
72
+ if (run.training_metrics) {
73
+ mergedRun.training_metrics.push(...run.training_metrics);
74
+ }
75
+ });
76
+
77
+ // Merge evaluation results (they should be continuous)
78
+ runs.forEach(run => {
79
+ if (run.evaluation_results) {
80
+ mergedRun.evaluation_results.push(...run.evaluation_results);
81
+ }
82
+ });
83
+
84
+ // Sort by step number to ensure proper ordering
85
+ mergedRun.training_metrics.sort((a, b) => a.step - b.step);
86
+ mergedRun.evaluation_results.sort((a, b) => a.step - b.step);
87
+
88
+ // Remove duplicates based on step number
89
+ mergedRun.training_metrics = mergedRun.training_metrics.filter((metric, index, self) =>
90
+ index === 0 || metric.step !== self[index - 1].step
91
+ );
92
+ mergedRun.evaluation_results = mergedRun.evaluation_results.filter((result, index, self) =>
93
+ index === 0 || result.step !== self[index - 1].step
94
+ );
95
+
96
+ console.log(`Merged ${baseName}: ${mergedRun.training_metrics.length} training points, ${mergedRun.evaluation_results.length} eval points`);
97
+ mergedRuns.push(mergedRun);
98
+ }
99
+ });
100
+
101
+ trainingData.runs = mergedRuns;
102
+ }
103
+
104
+ // Setup event listeners for controls
105
+ function setupEventListeners() {
106
+ document.getElementById('runSelect').addEventListener('change', function() {
107
+ updateCharts();
108
+ updateRunSummary();
109
+ updateConfigDetails();
110
+ });
111
+     document.getElementById('showTraining').addEventListener('change', updateCharts);
+     document.getElementById('showLearningRate').addEventListener('change', updateCharts);
+     document.getElementById('showEvaluation').addEventListener('change', updateCharts);
+ }
+
+ // Populate run selector dropdown
+ function populateRunSelector() {
+     const select = document.getElementById('runSelect');
+     const runs = trainingData.runs;
+
+     // Clear existing options
+     select.innerHTML = '<option value="all">All Runs</option>';
+
+     runs.forEach((run, index) => {
+         const option = document.createElement('option');
+         option.value = index;
+         option.textContent = run.run_name;
+         select.appendChild(option);
+     });
+ }
+
+ // Create all charts
+ function createCharts() {
+     createLossChart();
+     createLRChart();
+     createEvalChart();
+     createCombinedChart();
+ }
+
+ // Create training loss chart
+ function createLossChart() {
+     const ctx = document.getElementById('lossChart').getContext('2d');
+
+     charts.loss = new Chart(ctx, {
+         type: 'line',
+         data: getChartData('loss'),
+         options: {
+             responsive: true,
+             maintainAspectRatio: false,
+             plugins: {
+                 title: {
+                     display: true,
+                     text: 'Training Loss Over Time'
+                 },
+                 legend: {
+                     position: 'top'
+                 }
+             },
+             scales: {
+                 x: {
+                     type: 'linear',
+                     title: {
+                         display: true,
+                         text: 'Training Step'
+                     }
+                 },
+                 y: {
+                     title: {
+                         display: true,
+                         text: 'Loss'
+                     },
+                     beginAtZero: false
+                 }
+             },
+             interaction: {
+                 intersect: false,
+                 mode: 'index'
+             }
+         }
+     });
+ }
+
+ // Create learning rate chart
+ function createLRChart() {
+     const ctx = document.getElementById('lrChart').getContext('2d');
+
+     charts.lr = new Chart(ctx, {
+         type: 'line',
+         data: getChartData('lr'),
+         options: {
+             responsive: true,
+             maintainAspectRatio: false,
+             plugins: {
+                 title: {
+                     display: true,
+                     text: 'Learning Rate Schedule'
+                 },
+                 legend: {
+                     position: 'top'
+                 }
+             },
+             scales: {
+                 x: {
+                     type: 'linear',
+                     title: {
+                         display: true,
+                         text: 'Training Step'
+                     }
+                 },
+                 y: {
+                     title: {
+                         display: true,
+                         text: 'Learning Rate'
+                     },
+                     type: 'logarithmic'
+                 }
+             },
+             interaction: {
+                 intersect: false,
+                 mode: 'index'
+             }
+         }
+     });
+ }
+
+ // Create evaluation chart
+ function createEvalChart() {
+     const ctx = document.getElementById('evalChart').getContext('2d');
+
+     charts.eval = new Chart(ctx, {
+         type: 'line',
+         data: getChartData('eval'),
+         options: {
+             responsive: true,
+             maintainAspectRatio: false,
+             plugins: {
+                 title: {
+                     display: true,
+                     text: 'Paloma Evaluation Metrics'
+                 },
+                 legend: {
+                     position: 'top'
+                 }
+             },
+             scales: {
+                 x: {
+                     type: 'linear',
+                     title: {
+                         display: true,
+                         text: 'Training Step'
+                     }
+                 },
+                 y: {
+                     title: {
+                         display: true,
+                         text: 'Perplexity'
+                     },
+                     type: 'logarithmic'
+                 }
+             },
+             interaction: {
+                 intersect: false,
+                 mode: 'index'
+             }
+         }
+     });
+ }
+
+ // Create combined chart
+ function createCombinedChart() {
+     const ctx = document.getElementById('combinedChart').getContext('2d');
+
+     charts.combined = new Chart(ctx, {
+         type: 'line',
+         data: getCombinedChartData(),
+         options: {
+             responsive: true,
+             maintainAspectRatio: false,
+             plugins: {
+                 title: {
+                     display: true,
+                     text: 'Combined Training Metrics'
+                 },
+                 legend: {
+                     position: 'top'
+                 }
+             },
+             scales: {
+                 x: {
+                     type: 'linear',
+                     title: {
+                         display: true,
+                         text: 'Training Step'
+                     }
+                 },
+                 y: {
+                     title: {
+                         display: true,
+                         text: 'Value'
+                     }
+                 }
+             },
+             interaction: {
+                 intersect: false,
+                 mode: 'index'
+             }
+         }
+     });
+ }
+
+ // Get chart data for specific metric type
+ function getChartData(metricType) {
+     const selectedRun = document.getElementById('runSelect').value;
+     const runs = selectedRun === 'all' ? trainingData.runs : [trainingData.runs[selectedRun]];
+
+     const datasets = [];
+
+     console.log(`Getting ${metricType} data for ${runs.length} runs:`, runs.map(r => r.run_name));
+
+     runs.forEach((run, runIndex) => {
+         const color = colors[runIndex % colors.length];
+
+         if (metricType === 'loss') {
+             if (run.training_metrics && run.training_metrics.length > 0) {
+                 const data = run.training_metrics.map(m => ({ x: m.step, y: m.loss }));
+                 console.log(`Loss data for ${run.run_name}:`, data.slice(0, 5), '...', data.slice(-5));
+                 datasets.push({
+                     label: run.run_name,
+                     data: data,
+                     borderColor: color,
+                     backgroundColor: color + '20',
+                     borderWidth: 2,
+                     fill: false,
+                     tension: 0.1
+                 });
+             }
+         } else if (metricType === 'lr') {
+             if (run.training_metrics && run.training_metrics.length > 0) {
+                 const data = run.training_metrics.map(m => ({ x: m.step, y: m.learning_rate }));
+                 console.log(`LR data for ${run.run_name}:`, data.slice(0, 5), '...', data.slice(-5));
+                 datasets.push({
+                     label: run.run_name,
+                     data: data,
+                     borderColor: color,
+                     backgroundColor: color + '20',
+                     borderWidth: 2,
+                     fill: false,
+                     tension: 0.1
+                 });
+             }
+         } else if (metricType === 'eval') {
+             if (run.evaluation_results && run.evaluation_results.length > 0) {
+                 const data = run.evaluation_results.map(m => ({ x: m.step, y: m.paloma }));
+                 console.log(`Eval data for ${run.run_name}:`, data.slice(0, 5), '...', data.slice(-5));
+                 datasets.push({
+                     label: run.run_name,
+                     data: data,
+                     borderColor: color,
+                     backgroundColor: color + '20',
+                     borderWidth: 2,
+                     fill: false,
+                     tension: 0.1
+                 });
+             }
+         }
+     });
+
+     console.log(`Final ${metricType} datasets:`, datasets);
+     return { datasets };
+ }
+
+ // Get combined chart data
+ function getCombinedChartData() {
+     const selectedRun = document.getElementById('runSelect').value;
+     const runs = selectedRun === 'all' ? trainingData.runs : [trainingData.runs[selectedRun]];
+
+     const datasets = [];
+
+     runs.forEach((run, runIndex) => {
+         const color = colors[runIndex % colors.length];
+
+         // Training loss
+         if (run.training_metrics && run.training_metrics.length > 0) {
+             datasets.push({
+                 label: `${run.run_name} - Loss`,
+                 data: run.training_metrics.map(m => ({ x: m.step, y: m.loss })),
+                 borderColor: color,
+                 backgroundColor: color + '20',
+                 borderWidth: 2,
+                 fill: false,
+                 tension: 0.1
+             });
+         }
+
+         // Learning rate (scaled)
+         if (run.training_metrics && run.training_metrics.length > 0) {
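+             // Rescale the LR series onto the loss axis: the peak learning rate is
+             // mapped to the peak loss so both curves stay visible on one linear scale.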
+             const maxLR = Math.max(...run.training_metrics.map(m => m.learning_rate));
+             const maxLoss = Math.max(...run.training_metrics.map(m => m.loss));
+             const scaleFactor = maxLoss / maxLR;
+
+             datasets.push({
+                 label: `${run.run_name} - LR (scaled)`,
+                 data: run.training_metrics.map(m => ({ x: m.step, y: m.learning_rate * scaleFactor })),
+                 borderColor: color + '80',
+                 backgroundColor: color + '10',
+                 borderWidth: 1,
+                 fill: false,
+                 tension: 0.1
+             });
+         }
+     });
+
+     return { datasets };
+ }
+
+ // Update all charts based on current selection
+ function updateCharts() {
+     if (charts.loss) {
+         charts.loss.data = getChartData('loss');
+         charts.loss.update();
+     }
+
+     if (charts.lr) {
+         charts.lr.data = getChartData('lr');
+         charts.lr.update();
+     }
+
+     if (charts.eval) {
+         charts.eval.data = getChartData('eval');
+         charts.eval.update();
+     }
+
+     if (charts.combined) {
+         charts.combined.data = getCombinedChartData();
+         charts.combined.update();
+     }
+ }
+
+ // Update run summary section
+ function updateRunSummary() {
+     const container = document.getElementById('runSummary');
+     const selectedRun = document.getElementById('runSelect').value;
+     const runs = selectedRun === 'all' ? trainingData.runs : [trainingData.runs[selectedRun]];
+
+     let html = '<div class="run-grid">';
+
+     runs.forEach(run => {
+         const trainingPoints = run.training_metrics ? run.training_metrics.length : 0;
+         const evalPoints = run.evaluation_results ? run.evaluation_results.length : 0;
+
+         let finalLoss = 'N/A';
+         let finalLR = 'N/A';
+         let finalPaloma = 'N/A';
+         let stepRange = 'N/A';
+
+         if (run.training_metrics && run.training_metrics.length > 0) {
+             const first = run.training_metrics[0];
+             const last = run.training_metrics[run.training_metrics.length - 1];
+             finalLoss = last.loss.toFixed(4);
+             finalLR = last.learning_rate.toExponential(2);
+             stepRange = `${first.step} → ${last.step}`;
+         }
+
+         if (run.evaluation_results && run.evaluation_results.length > 0) {
+             const last = run.evaluation_results[run.evaluation_results.length - 1];
+             if (isFinite(last.paloma)) {
+                 finalPaloma = last.paloma.toExponential(2);
+             } else {
+                 finalPaloma = '∞';
+             }
+         }
+
+         const logFiles = run.log_files ? run.log_files.join(', ') : run.log_file;
+
+         html += `
+             <div class="run-card">
+                 <h4>${run.run_name}</h4>
+                 <p><strong>Logs:</strong> ${logFiles}</p>
+                 <div class="metric">
+                     <span>Step Range:</span>
+                     <span class="value">${stepRange}</span>
+                 </div>
+                 <div class="metric">
+                     <span>Training Points:</span>
+                     <span class="value">${trainingPoints}</span>
+                 </div>
+                 <div class="metric">
+                     <span>Evaluation Points:</span>
+                     <span class="value">${evalPoints}</span>
+                 </div>
+                 <div class="metric">
+                     <span>Final Loss:</span>
+                     <span class="value">${finalLoss}</span>
+                 </div>
+                 <div class="metric">
+                     <span>Final LR:</span>
+                     <span class="value">${finalLR}</span>
+                 </div>
+                 <div class="metric">
+                     <span>Final Paloma:</span>
+                     <span class="value">${finalPaloma}</span>
+                 </div>
+             </div>
+         `;
+     });
+
+     html += '</div>';
+     container.innerHTML = html;
+ }
+
+ // Update configuration details section
+ function updateConfigDetails() {
+     const container = document.getElementById('configDetails');
+     const selectedRun = document.getElementById('runSelect').value;
+     const runs = selectedRun === 'all' ? trainingData.runs : [trainingData.runs[selectedRun]];
+
+     let html = '<div class="config-grid">';
+
+     // Get unique config keys
+     const allKeys = new Set();
+     runs.forEach(run => {
+         if (run.config) {
+             Object.keys(run.config).forEach(key => allKeys.add(key));
+         }
+     });
+
+     allKeys.forEach(key => {
+         const values = runs.map(run => run.config && run.config[key] !== undefined ? run.config[key] : 'N/A');
+         const uniqueValues = [...new Set(values)];
+         const displayValue = uniqueValues.length === 1 ? uniqueValues[0] : `${uniqueValues.join(' / ')}`;
+
+         html += `
+             <div class="config-item">
+                 <div class="label">${key.replace(/_/g, ' ').toUpperCase()}</div>
+                 <div class="value">${displayValue}</div>
+             </div>
+         `;
+     });
+
+     html += '</div>';
+     container.innerHTML = html;
+ }
+
+ // Utility function to format large numbers
+ function formatNumber(num) {
+     if (num >= 1e9) return (num / 1e9).toFixed(2) + 'B';
+     if (num >= 1e6) return (num / 1e6).toFixed(2) + 'M';
+     if (num >= 1e3) return (num / 1e3).toFixed(2) + 'K';
+     return num.toString();
+ }
plots/data.json ADDED
The diff for this file is too large to render.
 
plots/index.html ADDED
@@ -0,0 +1,72 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>Pico Training Metrics Dashboard</title>
+     <link rel="stylesheet" href="style.css">
+     <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
+ </head>
+ <body>
+     <div class="container">
+         <header>
+             <h1>🚀 Pico Training Metrics Dashboard</h1>
+             <p>Real-time visualization of training progress across all model runs</p>
+         </header>
+
+         <div class="controls">
+             <div class="run-selector">
+                 <label for="runSelect">Select Run:</label>
+                 <select id="runSelect">
+                     <option value="all">All Runs</option>
+                 </select>
+             </div>
+             <div class="metric-toggle">
+                 <label>
+                     <input type="checkbox" id="showTraining" checked> Training Loss
+                 </label>
+                 <label>
+                     <input type="checkbox" id="showLearningRate" checked> Learning Rate
+                 </label>
+                 <label>
+                     <input type="checkbox" id="showEvaluation" checked> Paloma Evaluation
+                 </label>
+             </div>
+         </div>
+
+         <div class="charts-container">
+             <div class="chart-card">
+                 <h3>📈 Training Loss Over Time</h3>
+                 <canvas id="lossChart"></canvas>
+             </div>
+
+             <div class="chart-card">
+                 <h3>🎯 Learning Rate Schedule</h3>
+                 <canvas id="lrChart"></canvas>
+             </div>
+
+             <div class="chart-card">
+                 <h3>📊 Paloma Evaluation Metrics</h3>
+                 <canvas id="evalChart"></canvas>
+             </div>
+
+             <div class="chart-card">
+                 <h3>🔄 Combined View</h3>
+                 <canvas id="combinedChart"></canvas>
+             </div>
+         </div>
+
+         <div class="run-summary">
+             <h3>📋 Run Summary</h3>
+             <div id="runSummary"></div>
+         </div>
+
+         <div class="config-details">
+             <h3>⚙️ Model Configuration</h3>
+             <div id="configDetails"></div>
+         </div>
+     </div>
+
+     <script src="code.js"></script>
+ </body>
+ </html>
plots/style.css ADDED
@@ -0,0 +1,258 @@
+ * {
+     margin: 0;
+     padding: 0;
+     box-sizing: border-box;
+ }
+
+ body {
+     font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     min-height: 100vh;
+     color: #333;
+ }
+
+ .container {
+     max-width: 1400px;
+     margin: 0 auto;
+     padding: 20px;
+ }
+
+ header {
+     text-align: center;
+     margin-bottom: 30px;
+     color: white;
+ }
+
+ header h1 {
+     font-size: 2.5rem;
+     margin-bottom: 10px;
+     text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
+ }
+
+ header p {
+     font-size: 1.1rem;
+     opacity: 0.9;
+ }
+
+ .controls {
+     background: white;
+     padding: 20px;
+     border-radius: 12px;
+     box-shadow: 0 8px 32px rgba(0,0,0,0.1);
+     margin-bottom: 30px;
+     display: flex;
+     justify-content: space-between;
+     align-items: center;
+     flex-wrap: wrap;
+     gap: 20px;
+ }
+
+ .run-selector select {
+     padding: 8px 16px;
+     border: 2px solid #e1e5e9;
+     border-radius: 8px;
+     font-size: 14px;
+     background: white;
+     cursor: pointer;
+     transition: border-color 0.3s ease;
+ }
+
+ .run-selector select:focus {
+     outline: none;
+     border-color: #667eea;
+ }
+
+ .metric-toggle {
+     display: flex;
+     gap: 20px;
+     flex-wrap: wrap;
+ }
+
+ .metric-toggle label {
+     display: flex;
+     align-items: center;
+     gap: 8px;
+     cursor: pointer;
+     font-weight: 500;
+     color: #555;
+ }
+
+ .metric-toggle input[type="checkbox"] {
+     width: 18px;
+     height: 18px;
+     accent-color: #667eea;
+ }
+
+ .charts-container {
+     display: grid;
+     grid-template-columns: repeat(auto-fit, minmax(600px, 1fr));
+     gap: 30px;
+     margin-bottom: 30px;
+ }
+
+ .chart-card {
+     background: white;
+     padding: 25px;
+     border-radius: 12px;
+     box-shadow: 0 8px 32px rgba(0,0,0,0.1);
+     transition: transform 0.3s ease, box-shadow 0.3s ease;
+ }
+
+ .chart-card:hover {
+     transform: translateY(-5px);
+     box-shadow: 0 12px 40px rgba(0,0,0,0.15);
+ }
+
+ .chart-card h3 {
+     margin-bottom: 20px;
+     color: #333;
+     font-size: 1.2rem;
+     display: flex;
+     align-items: center;
+     gap: 8px;
+ }
+
+ .chart-card canvas {
+     max-height: 400px;
+     width: 100% !important;
+ }
+
+ .run-summary, .config-details {
+     background: white;
+     padding: 25px;
+     border-radius: 12px;
+     box-shadow: 0 8px 32px rgba(0,0,0,0.1);
+     margin-bottom: 30px;
+ }
+
+ .run-summary h3, .config-details h3 {
+     margin-bottom: 20px;
+     color: #333;
+     font-size: 1.2rem;
+     display: flex;
+     align-items: center;
+     gap: 8px;
+ }
+
+ .run-grid {
+     display: grid;
+     grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
+     gap: 20px;
+ }
+
+ .run-card {
+     background: #f8f9fa;
+     padding: 20px;
+     border-radius: 8px;
+     border-left: 4px solid #667eea;
+ }
+
+ .run-card h4 {
+     color: #667eea;
+     margin-bottom: 10px;
+     font-size: 1.1rem;
+ }
+
+ .run-card p {
+     margin-bottom: 8px;
+     color: #666;
+     font-size: 0.9rem;
+ }
+
+ .run-card .metric {
+     display: flex;
+     justify-content: space-between;
+     margin-bottom: 5px;
+ }
+
+ .run-card .metric .value {
+     font-weight: 600;
+     color: #333;
+ }
+
+ .config-grid {
+     display: grid;
+     grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+     gap: 15px;
+ }
+
+ .config-item {
+     background: #f8f9fa;
+     padding: 15px;
+     border-radius: 8px;
+     text-align: center;
+ }
+
+ .config-item .label {
+     font-size: 0.8rem;
+     color: #666;
+     text-transform: uppercase;
+     letter-spacing: 0.5px;
+     margin-bottom: 5px;
+ }
+
+ .config-item .value {
+     font-size: 1.2rem;
+     font-weight: 600;
+     color: #333;
+ }
+
+ @media (max-width: 768px) {
+     .container {
+         padding: 15px;
+     }
+
+     header h1 {
+         font-size: 2rem;
+     }
+
+     .controls {
+         flex-direction: column;
+         align-items: stretch;
+     }
+
+     .charts-container {
+         grid-template-columns: 1fr;
+     }
+
+     .chart-card {
+         padding: 20px;
+     }
+
+     .run-grid, .config-grid {
+         grid-template-columns: 1fr;
+     }
+ }
+
+ /* Chart.js customizations */
+ .chartjs-tooltip {
+     background: rgba(0,0,0,0.8) !important;
+     color: white !important;
+     border-radius: 8px !important;
+     padding: 10px !important;
+     font-size: 12px !important;
+ }
+
+ /* Loading state */
+ .loading {
+     text-align: center;
+     padding: 40px;
+     color: #666;
+ }
+
+ .loading::after {
+     content: '';
+     display: inline-block;
+     width: 20px;
+     height: 20px;
+     border: 3px solid #f3f3f3;
+     border-top: 3px solid #667eea;
+     border-radius: 50%;
+     animation: spin 1s linear infinite;
+     margin-left: 10px;
+ }
+
+ @keyframes spin {
+     0% { transform: rotate(0deg); }
+     100% { transform: rotate(360deg); }
+ }
pyproject.toml ADDED
@@ -0,0 +1,33 @@
+ [tool.poetry]
+ name = "pico-train"
+ version = "1.0.0"
+ description = "A minimalistic framework for transparently training language models and storing comprehensive checkpoints for in-depth learning dynamics research"
+ authors = ["Richard Diehl Martinez <richard@picolm.io>"]
+ license = "Apache 2.0"
+ readme = "README.md"
+ packages = [{include = "src"}]
+
+ [tool.poetry.scripts]
+ train = "scripts.train:main"
+
+ [tool.poetry.dependencies]
+ python = "^3.10,<3.13"
+ lightning = "^2.4.0"
+ click = "^8.1.7"
+ wandb = "^0.18.1"
+ huggingface-hub = {extras = ["cli"], version = "^0.25.1"}
+ datasets = "^3.0.1,<3.2.0"
+ transformers = "^4.45.2"
+ pre-commit = "^4.0.1"
+ torch = "^2.5.1"
+ evaluate = "^0.4.3"
+ deepspeed = "^0.16.2"
+ rich = "^13.9.4"
+
+ [tool.poetry.group.dev.dependencies]
+ ipykernel = "^6.29.5"
+ jupyter = "^1.1.1"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
scripts/README.md ADDED
@@ -0,0 +1,109 @@
+ # Scripts Directory
+
+ This directory contains utility scripts for the Pico training framework.
+
+ ## generate_data.py
+
+ A script to automatically generate `data.json` from training log files for the dashboard.
+
+ ### What it does
+
+ This script parses log files from the `runs/` directory and extracts:
+ - **Training metrics**: Loss, learning rate, and inf/NaN counts at each step
+ - **Evaluation results**: Paloma evaluation metrics
+ - **Model configuration**: Architecture parameters (d_model, n_layers, etc.)
+
+ ### Usage
+
+ ```bash
+ # Generate data.json from the default runs directory
+ python scripts/generate_data.py
+
+ # Specify custom runs directory
+ python scripts/generate_data.py --runs-dir /path/to/runs
+
+ # Specify custom output file
+ python scripts/generate_data.py --output /path/to/output.json
+ ```
+
+ ### How it works
+
+ 1. **Scans runs directory**: Looks for subdirectories containing training runs
+ 2. **Finds log files**: Locates `.log` files in each run's `logs/` subdirectory
+ 3. **Parses log content**: Uses regex patterns to extract structured data (see the sketch below)
+ 4. **Generates JSON**: Creates a structured JSON file for the dashboard
+
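+ As a rough illustration of step 3 (a minimal sketch only; the real patterns live in
+ `scripts/generate_data.py` and match a full multi-line log block at once):
+
+ ```python
+ import re
+
+ # `log_text` is assumed to hold the contents of one .log file
+ step_pattern = re.compile(r"Step (\d+) -- 🔄 Training Metrics")
+ steps = [int(m.group(1)) for m in step_pattern.finditer(log_text)]
+ ```
+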
+ ### Log Format Requirements
+
+ The script expects log files with the following format:
+
+ ```
+ 2025-08-29 02:09:12 - pico-train - INFO - Step 500 -- 🔄 Training Metrics
+ 2025-08-29 02:09:12 - pico-train - INFO - ├── Loss: 10.8854
+ 2025-08-29 02:09:12 - pico-train - INFO - ├── Learning Rate: 3.13e-06
+ 2025-08-29 02:09:12 - pico-train - INFO - └── Inf/NaN count: 0
+ ```
+
+ And evaluation results:
+
+ ```
+ 2025-08-29 02:15:26 - pico-train - INFO - Step 1000 -- 📊 Evaluation Results
+ 2025-08-29 02:15:26 - pico-train - INFO - └── paloma: 7.125172406420199e+27
+ ```
+
+ ### Output Format
+
+ The generated `data.json` has this structure:
+
+ ```json
+ {
+   "runs": [
+     {
+       "run_name": "model-name",
+       "log_file": "log_filename.log",
+       "training_metrics": [
+         {
+           "step": 0,
+           "loss": 10.9914,
+           "learning_rate": 0.0,
+           "inf_nan_count": 0
+         }
+       ],
+       "evaluation_results": [
+         {
+           "step": 1000,
+           "paloma": 59434.76600609756
+         }
+       ],
+       "config": {
+         "d_model": 96,
+         "n_layers": 12,
+         "max_seq_len": 2048,
+         "vocab_size": 50304,
+         "lr": 0.0003,
+         "max_steps": 200000,
+         "batch_size": 8
+       }
+     }
+   ],
+   "summary": {
+     "total_runs": 1,
+     "run_names": ["model-name"]
+   }
+ }
+ ```
+
+ ### When to use
+
+ - **After training**: Generate updated dashboard data
+ - **Adding new runs**: Include new training sessions in the dashboard
+ - **Debugging**: Verify log parsing is working correctly
+ - **Dashboard setup**: Initial setup of the training metrics dashboard
+
+ ### Troubleshooting
+
+ If the script doesn't find any data:
+ 1. Check that log files exist in `runs/*/logs/`
+ 2. Verify log format matches the expected pattern
+ 3. Ensure log files contain training metrics entries
+ 4. Check file permissions and encoding
scripts/generate_data.py ADDED
@@ -0,0 +1,198 @@
+ #!/usr/bin/env python3
+ """
+ Script to generate data.json from training log files.
+
+ This script parses log files from the runs directory and extracts:
+ - Training metrics (loss, learning rate, inf/nan count)
+ - Evaluation results (paloma metrics)
+ - Model configuration parameters
+
+ The output is saved to plots/data.json for the dashboard.
+ """
+
+ import json
+ import re
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+
+ def parse_training_metrics(log_content: str) -> List[Dict[str, Any]]:
+     """Parse training metrics from log content."""
+     metrics = []
+
+     # Pattern to match training metrics entries with timestamp and log level
+     pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - Step (\d+) -- 🔄 Training Metrics\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - ├── Loss: ([\d.]+)\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - ├── Learning Rate: ([\d.e+-]+)\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - └── Inf/NaN count: (\d+)"
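+     # NOTE: the pattern above only matches when these four log lines appear
+     # back-to-back; any interleaved log output between them breaks the match.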
+
+     matches = re.findall(pattern, log_content)
+
+     for step, loss, lr, inf_nan in matches:
+         metrics.append(
+             {
+                 "step": int(step),
+                 "loss": float(loss),
+                 "learning_rate": float(lr),
+                 "inf_nan_count": int(inf_nan),
+             }
+         )
+
+     return sorted(metrics, key=lambda x: x["step"])
+
+
+ def parse_evaluation_results(log_content: str) -> List[Dict[str, Any]]:
+     """Parse evaluation results from log content."""
+     results = []
+
+     # Pattern to match evaluation results with timestamp and log level
+     pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - Step (\d+) -- 📊 Evaluation Results\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - └── paloma: ([\d.e+-]+)"
+
+     matches = re.findall(pattern, log_content)
+
+     for step, paloma in matches:
+         try:
+             paloma_value = float(paloma)
+             results.append({"step": int(step), "paloma": paloma_value})
+         except ValueError:
+             # Skip entries whose paloma value cannot be parsed as a float
+             continue
+
+     return sorted(results, key=lambda x: x["step"])
+
+
+ def extract_config_from_log(log_content: str) -> Dict[str, Any]:
+     """Extract model configuration from log content."""
+     config = {}
+
+     # Extract key model parameters
+     patterns = {
+         "d_model": r"d_model: (\d+)",
+         "n_layers": r"n_layers: (\d+)",
+         "max_seq_len": r"max_seq_len: (\d+)",
+         "vocab_size": r"vocab_size: (\d+)",
+         "lr": r"lr: ([\d.e+-]+)",
+         "max_steps": r"max_steps: (\d+)",
+         "batch_size": r"batch_size: (\d+)",
+     }
+
+     for key, pattern in patterns.items():
+         match = re.search(pattern, log_content)
+         if match:
+             try:
+                 if key in [
+                     "d_model",
+                     "n_layers",
+                     "max_seq_len",
+                     "vocab_size",
+                     "max_steps",
+                     "batch_size",
+                 ]:
+                     config[key] = int(match.group(1))
+                 else:
+                     config[key] = float(match.group(1))
+             except ValueError:
+                 continue
+
+     return config
+
+
+ def process_run_directory(run_path: Path) -> Optional[Dict[str, Any]]:
+     """Process a single run directory and extract all data."""
+     run_name = run_path.name
+
+     # Find log files
+     logs_dir = run_path / "logs"
+     if not logs_dir.exists():
+         return None
+
+     log_files = list(logs_dir.glob("*.log"))
+     if not log_files:
+         return None
+
+     # Use the most recent log file for configuration
+     latest_log = max(log_files, key=lambda x: x.stat().st_mtime)
+
+     # Read log content
+     log_content = latest_log.read_text(encoding="utf-8")
+
+     # Extract data
+     training_metrics = parse_training_metrics(log_content)
+     evaluation_results = parse_evaluation_results(log_content)
+     config = extract_config_from_log(log_content)
+
+     # If no training metrics found, skip this run
+     if not training_metrics:
+         return None
+
+     return {
+         "run_name": run_name,
+         "log_file": latest_log.name,
+         "training_metrics": training_metrics,
+         "evaluation_results": evaluation_results,
+         "config": config,
+     }
+
+
+ def generate_data_json(runs_dir: str = "runs", output_file: str = "plots/data.json"):
+     """Generate data.json from all run directories."""
+     runs_path = Path(runs_dir)
+     if not runs_path.exists():
+         print(f"Runs directory {runs_dir} not found!")
+         return
+
+     runs_data = []
+
+     # Process each run directory
+     for run_dir in runs_path.iterdir():
+         if run_dir.is_dir():
+             print(f"Processing run: {run_dir.name}")
+             run_data = process_run_directory(run_dir)
+             if run_data:
+                 runs_data.append(run_data)
+                 print(f"  ✓ Found {len(run_data['training_metrics'])} training metrics")
+                 print(
+                     f"  ✓ Found {len(run_data['evaluation_results'])} evaluation results"
+                 )
+             else:
+                 print("  ✗ No valid data found")
+
+     if not runs_data:
+         print("No valid runs found!")
+         return
+
+     # Create output data structure
+     output_data = {
+         "runs": runs_data,
+         "summary": {
+             "total_runs": len(runs_data),
+             "run_names": [run["run_name"] for run in runs_data],
+         },
+     }
+
+     # Ensure output directory exists
+     output_path = Path(output_file)
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # Write to file
+     with open(output_path, "w", encoding="utf-8") as f:
+         json.dump(output_data, f, indent=2, ensure_ascii=False)
+
+     print(f"\n✓ Generated {output_file} with {len(runs_data)} runs")
+     print(
+         f"✓ Total training metrics: {sum(len(run['training_metrics']) for run in runs_data)}"
+     )
+     print(
+         f"✓ Total evaluation results: {sum(len(run['evaluation_results']) for run in runs_data)}"
+     )
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description="Generate data.json from training logs"
+     )
+     parser.add_argument("--runs-dir", default="runs", help="Path to runs directory")
+     parser.add_argument("--output", default="plots/data.json", help="Output file path")
+
+     args = parser.parse_args()
+
+     generate_data_json(args.runs_dir, args.output)
scripts/train.py ADDED
@@ -0,0 +1,30 @@
+ #!/usr/bin/env python3
+ """
+ A minimal script to train the Pico language model. In practice, you should just use the
+ `poetry run train` command to run the training pipeline. Doing so will invoke this script.
+ Training logic is located in `src/training/trainer.py`.
+ """
+
+ from pathlib import Path
+
+ import click
+
+ from src.training.trainer import Trainer
+
+
+ @click.command()
+ @click.option(
+     "--config_path",
+     "config_path",
+     type=click.Path(exists=True, path_type=Path),
+     help="Path to the training configuration file",
+ )
+ def main(config_path: Path) -> None:
+     """Train the Pico language model using the specified configuration."""
+
+     trainer = Trainer(config_path=str(config_path))
+     trainer.train()
+
+
+ if __name__ == "__main__":
+     main()
setup.sh ADDED
@@ -0,0 +1,200 @@
+ #!/bin/bash
+ # This script sets up the project by installing dependencies, checking for a poetry environment,
+ # and installing pre-commit hooks.
+
+ # Add color and formatting variables at the top
+ GREEN='\033[0;32m'
+ BLUE='\033[0;34m'
+ YELLOW='\033[1;33m'
+ RED='\033[0;31m'
+ NC='\033[0m' # No Color
+ BOLD='\033[1m'
+
+ # Initialize error tracking
+ ERRORS_FOUND=0
+
+ # Function for section headers
+ print_section() {
+     echo -e "\n${BOLD}${BLUE}=== $1 ===${NC}\n"
+ }
+
+ # Function for success messages
+ print_success() {
+     echo -e "${GREEN}✓ $1${NC}"
+ }
+
+ # Function for warnings
+ print_warning() {
+     echo -e "${YELLOW}⚠ $1${NC}"
+ }
+
+ # --- GIT LFS SETUP --- #
+ print_section "Git LFS Setup"
+ if ! command -v git-lfs &> /dev/null; then
+     print_warning "git-lfs is not installed. Some model checkpointing functionality may not work correctly."
+     ERRORS_FOUND=$((ERRORS_FOUND + 1))
+
+     # Check the operating system
+     if [[ "$OSTYPE" == "darwin"* ]]; then
+         # macOS
+         echo -e "${YELLOW} You can install it using Homebrew:${NC}"
+         echo " brew install git-lfs"
+     elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
+         # Linux
+         echo -e "${YELLOW} You can install it using your package manager:${NC}"
+         if command -v apt-get &> /dev/null; then
+             # Ubuntu/Debian
+             echo " curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash"
+             echo " sudo apt-get install git-lfs"
+         elif command -v yum &> /dev/null; then
+             # CentOS/RHEL
+             echo " curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.rpm.sh | sudo bash"
+             echo " sudo yum install git-lfs"
+         else
+             print_warning "Could not detect package manager. Please install git-lfs manually."
+         fi
+     else
+         print_warning "Unsupported operating system. Please install git-lfs manually."
+     fi
+ else
+     git-lfs install
+     print_success "git-lfs installed and initialized"
+ fi
+
+ # --- CUDA VERSION CHECK --- #
+ print_section "CUDA Version Check"
+ if command -v nvidia-smi &> /dev/null; then
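+     # nvidia-smi prints a banner such as "... CUDA Version: 12.4 ..." (example value);
+     # the sed expression below captures that numeric version from the output.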
+     CUDA_VERSION=$(nvidia-smi | sed -n 's/.*CUDA Version: \([0-9.]*\).*/\1/p')
+
+     if [[ -z "$CUDA_VERSION" ]]; then
+         ERRORS_FOUND=$((ERRORS_FOUND + 1))
+         print_warning "nvidia-smi failed to communicate with the NVIDIA driver."
+         echo -e "${YELLOW} Ensure that the latest NVIDIA driver is installed and running.${NC}"
+     else
+         MAJOR_VERSION=${CUDA_VERSION%.*}
+         MINOR_VERSION=${CUDA_VERSION#*.}
+
+         if [ "$MAJOR_VERSION" -lt 12 ] || ([ "$MAJOR_VERSION" -eq 12 ] && [ "$MINOR_VERSION" -lt 1 ]); then
+             ERRORS_FOUND=$((ERRORS_FOUND + 1))
+             print_warning "CUDA version ${MAJOR_VERSION}.${MINOR_VERSION} detected."
+             echo -e "${YELLOW} Some multi-node communication GPU features may not work properly.${NC}"
+             echo -e "${YELLOW} CUDA version 12.1 or newer is recommended.${NC}"
+         else
+             print_success "CUDA version ${MAJOR_VERSION}.${MINOR_VERSION} detected"
+         fi
+     fi
+ else
+     ERRORS_FOUND=$((ERRORS_FOUND + 1))
+     print_warning "nvidia-smi not found. Unable to check CUDA version."
+     echo -e "${YELLOW} Ensure that NVIDIA drivers and CUDA 12.1 or newer are installed for GPU support.${NC}"
+ fi
+
+
+ # ---- ENVIRONMENT VARIABLES ---- #
+ print_section "Environment Variables"
+ if [ -f .env ]; then
+     print_success "Loading environment variables from .env..."
+     source .env
+     if [[ -n "$HF_TOKEN" && -n "$WANDB_API_KEY" ]]; then
+         print_success "Both HF_TOKEN and WANDB_API_KEY are set and loaded!"
+     else
+         print_warning "One or both of HF_TOKEN and WANDB_API_KEY are not set."
+     fi
+ else
+     print_warning "No .env file found."
+     echo -e "${YELLOW} You might need to create one with HF_TOKEN and WANDB_API_KEY${NC}"
+     echo -e "${YELLOW} Example .env contents:${NC}"
+     echo " export HF_TOKEN=your_huggingface_token"
+     echo " export WANDB_API_KEY=your_wandb_key"
+     ERRORS_FOUND=$((ERRORS_FOUND + 1))
+ fi
+
+ # ---- POETRY SETUP ---- #
+ print_section "Poetry Setup"
+
+ # First check if Poetry is installed
+ if ! command -v poetry &> /dev/null; then
+     echo "Poetry not found. Installing..."
+
+     # Run the installation command
+     curl -sSL https://install.python-poetry.org | python3 -
+     POETRY_INSTALL_STATUS=$?
+
+     if [ $POETRY_INSTALL_STATUS -ne 0 ]; then
+         print_warning "Poetry installation failed!"
+         ERRORS_FOUND=$((ERRORS_FOUND + 1))
+     else
+         export PATH="$HOME/.local/bin:$PATH"
+
+         # Verify installation succeeded
+         if ! command -v poetry &> /dev/null; then
+             print_warning "Poetry was installed but cannot be found in PATH!"
+             echo -e "${YELLOW} Try adding this to your shell profile:${NC}"
+             echo " export PATH=\"\$HOME/.local/bin:\$PATH\""
+             ERRORS_FOUND=$((ERRORS_FOUND + 1))
+         else
+             print_success "Poetry installed successfully"
+         fi
+     fi
+ else
+     print_success "Poetry already installed"
+ fi
+
+ # Then check for virtual environment
+ if [ ! -d ".venv" ]; then
+     echo "No virtual environment found. Creating one..."
+     poetry config virtualenvs.in-project true
+
+     # Create virtual environment and install dependencies
+     poetry install --with dev
+     POETRY_VENV_STATUS=$?
+
+     if [ $POETRY_VENV_STATUS -ne 0 ]; then
+         print_warning "Failed to create Poetry virtual environment!"
+         ERRORS_FOUND=$((ERRORS_FOUND + 1))
+     else
+         print_success "Poetry environment created successfully"
+     fi
+ else
+     print_success "Poetry environment already exists"
+ fi
+
+ # ---- PRE-COMMIT SETUP ---- #
+ print_section "Pre-commit Setup"
+
+ # Install pre-commit hooks
+ echo "Installing pre-commit hooks..."
+ poetry run pre-commit install
+ if [ $? -ne 0 ]; then
+     print_warning "Failed to install pre-commit hooks!"
+     ERRORS_FOUND=$((ERRORS_FOUND + 1))
+ else
+     print_success "Pre-commit hooks installed"
+ fi
+
+ # Run pre-commit hooks on all files
+ echo "Running pre-commit hooks on all files..."
+ poetry run pre-commit run --all-files
+ if [ $? -ne 0 ]; then
+     print_warning "Pre-commit encountered issues with some files"
+     ERRORS_FOUND=$((ERRORS_FOUND + 1))
+ else
+     print_success "Pre-commit initial run complete"
+ fi
+
+ # --- Final Status Message --- #
+
+ # Final status message
+ print_section "Setup Status"
+ if [ $ERRORS_FOUND -eq 0 ]; then
+     print_success "Setup Complete! 🎉"
+     print_success "To activate the virtual environment, run: poetry env activate"
+ else
+     print_warning "Setup completed with warnings and errors! Please check the messages above."
+     echo -e "${YELLOW} ${ERRORS_FOUND} issue(s) were detected that may affect functionality.${NC}"
+     if [ -d ".venv" ]; then
+         echo -e "${YELLOW} You can still activate the environment with: poetry env activate${NC}"
+     else
+         echo -e "${RED} The virtual environment setup failed. Fix the issues before proceeding.${NC}"
+     fi
+ fi
src/checkpointing/__init__.py ADDED
@@ -0,0 +1,23 @@
+ """
+ Pico Checkpointing Package
+
+ We subdivide the checkpointing into training, evaluation, and learning_dynamics. Training
+ checkpoints store the model, optimizer, and learning rate scheduler. Evaluation checkpoints
+ store the evaluation results on the defined metrics. Learning dynamics checkpoints store the
+ activations, weights, and gradients used for learning dynamics analysis.
+ """
+
+ from .evaluation import save_evaluation_results
+ from .learning_dynamics import (
+     compute_learning_dynamics_states,
+     save_learning_dynamics_states,
+ )
+ from .training import load_checkpoint, save_checkpoint
+
+ __all__ = [
+     "compute_learning_dynamics_states",
+     "load_checkpoint",
+     "save_checkpoint",
+     "save_evaluation_results",
+     "save_learning_dynamics_states",
+ ]
src/checkpointing/evaluation.py ADDED
@@ -0,0 +1,68 @@
+ """
+ Utilities for checkpointing evaluation-related states (i.e. evaluation results, etc.)
+
+ We save the evaluation results in a JSON file at the step-specific evaluation results directory.
+ """
+
+ import json
+ import os
+ from typing import Any, Dict
+
+ from huggingface_hub import upload_folder
+ from lightning.fabric import Fabric
+ from lightning.fabric.utilities.rank_zero import rank_zero_only
+
+ from src.config import CheckpointingConfig
+ from src.training.utils.io import use_backoff
+
+
+ @rank_zero_only
+ @use_backoff()
+ def save_evaluation_results(
+     checkpointing_config: CheckpointingConfig,
+     checkpoint_step: int,
+     fabric: Fabric,
+     evaluation_results: Dict[str, Any],
+ ) -> None:
+     """Save evaluation results to disk and optionally to HuggingFace Hub.
+
+     The evaluation results are saved in the following directory structure:
+         {checkpointing_config.runs_dir}/
+         └── {checkpointing_config.run_name}/
+             └── {checkpointing_config.eval_results_dir}/
+                 └── step_{checkpoint_step}.json
+
+     NOTE: this function is only called on rank 0 to avoid conflicts; assumes that the evaluation
+     results are gathered on rank 0.
+
+     Args:
+         checkpointing_config: Configuration object containing checkpoint settings
+         checkpoint_step: Current training checkpoint step (i.e. number of learning steps taken)
+         fabric: Lightning Fabric instance
+         evaluation_results: Dictionary containing evaluation metrics
+     """
+
+     run_dir = os.path.join(checkpointing_config.runs_dir, checkpointing_config.run_name)
+     eval_results_dir = os.path.join(
+         run_dir, checkpointing_config.evaluation.eval_results_dir
+     )
+
+     os.makedirs(eval_results_dir, exist_ok=True)
+
+     curr_eval_results_path = os.path.join(
+         eval_results_dir, f"step_{checkpoint_step}.json"
+     )
+
+     # save out as json
+     with open(curr_eval_results_path, "w") as f:
+         json.dump(evaluation_results, f)
+
+     if checkpointing_config.save_to_hf:
+         upload_folder(
+             folder_path=eval_results_dir,
+             path_in_repo=checkpointing_config.evaluation.eval_results_dir,
+             repo_id=checkpointing_config.hf_checkpoint.repo_id,
+             commit_message=f"Saving Evaluation Results -- Step {checkpoint_step}",
+             revision=checkpointing_config.run_name,
+             token=os.getenv("HF_TOKEN"),
+         )
src/checkpointing/learning_dynamics.py ADDED
@@ -0,0 +1,424 @@
+ """
+ Utilities for checkpointing learning dynamics-related states (i.e. activations, weights, grads, etc.)
+
+ We save the learning dynamics states in a subdirectory of the checkpointing directory.
+ """
+
+ import os
+ import re
+ from typing import Dict, Optional
+
+ import deepspeed
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ from datasets import Dataset
+ from huggingface_hub import upload_folder
+ from lightning.fabric import Fabric
+ from lightning.fabric.strategies import DeepSpeedStrategy
+ from lightning.fabric.utilities.rank_zero import rank_zero_only
+ from torch.nn import functional as F
+ from torch.utils.data import DataLoader
+ from transformers import PreTrainedTokenizerBase
+
+ from src.config import CheckpointingConfig
+ from src.config.checkpointing_config import LearningDynamicsCheckpointingConfig
+ from src.training.utils.initialization import initialize_model
+ from src.training.utils.io import use_backoff
+
+
+ # NOTE: DeepSpeed requires a dummy optimizer to be passed in to the setup function
+ class DummyOptimizer(optim.Optimizer):
+     def __init__(self, params):
+         super().__init__(params, defaults={})
+
+
+ class CheckpointStateExtractor:
+     """
+     Class to extract and save the states of a model at a given checkpoint step for learning
+     dynamics research.
+     """
+
+     def __init__(
+         self,
+         learning_dynamics_config: LearningDynamicsCheckpointingConfig,
+         fabric: Fabric,
+         model: nn.Module,
+     ):
+         self.learning_dynamics_config = learning_dynamics_config
+         self.fabric = fabric
+         self.model = model
+
+     def extract_states(self, dataloader, compute_gradients: bool = False):
+         """Extracts model states (activations, weights, and optionally gradients).
+
+         Given a dataloader, this function will perform a forward pass of the model on each batch,
+         and save the activations and weights at each layer. If compute_gradients is True, it will
+         also compute the gradients of the model parameters.
+
+         Args:
+             dataloader: The dataloader containing the dataset to extract states from.
+             compute_gradients: Whether to compute the gradients of the model parameters.
+
+         Returns:
+             A dictionary containing the activations, weights, and optionally gradients of the model.
+         """
+         checkpoint_activations = {}
+         checkpoint_weights = {}
+
+         # NOTE: to extract activations and weights, we need to set up forward hooks on the layers
+         # of the model that we are interested in. This is a good intro to forward hooks if you
+         # are not familiar: https://web.stanford.edu/~nanbhas/blog/forward-hooks-pytorch/
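+         # For reference: a hook registered via module.register_forward_hook(fn) is
+         # called as fn(module, inputs, output) after each forward pass of that module,
+         # and the returned handle can later be detached with handle.remove().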
+         forward_hooks = self._setup_forward_hooks(
+             checkpoint_activations,
+             checkpoint_weights,
+         )
+
+         ########################################################
+         #
+         # Forward Pass: Extract activations and weights; and compute gradients
+         #
+         ########################################################
+
+         for sub_batch in dataloader:
+             _input_ids = torch.tensor(sub_batch["input_ids"], device=self.fabric.device)
+
+             if compute_gradients:
+                 if "labels" in sub_batch:
+                     input_ids = _input_ids
+                     labels = torch.tensor(
+                         sub_batch["labels"], device=self.fabric.device
+                     )
+                 else:
+                     input_ids = _input_ids[:, :-1]
+                     labels = _input_ids[:, 1:]
+             else:
+                 input_ids = _input_ids
+                 labels = None
+
+             if labels is None:
+                 # we can throw away the outputs, we are only interested in the hidden states
+                 with torch.no_grad():
+                     _ = self.model(input_ids)
+             else:
+                 # NOTE: if we are computing gradients, calling backwards will compute the gradients
+                 # of the model parameters.
+                 outputs, _ = self.model(input_ids)
+                 outputs = outputs.transpose(1, 2)
+                 loss = F.cross_entropy(outputs, labels)
+                 self.fabric.backward(loss, model=self.model)
+
+         # cleanup forward hooks
+         # NOTE: this is not strictly necessary, since self.model is a separate copy of the original
+         # model, but it is good practice to remove the hooks after the forward pass is complete.
+         for hook in forward_hooks:
+             hook.remove()
+
+         ########################################################
+         #
+         # Extract gradients from the target tensors of the model
+         #
+         ########################################################
+
+         layer_suffixes = self.learning_dynamics_config.layer_suffixes
+         checkpoint_gradients = {}
+         if compute_gradients:
+             for name, param in self.model.named_parameters():
+                 # only do this for the weight matrix of the layer_suffixes
+                 if (
+                     any(layer_suffix in name for layer_suffix in layer_suffixes)
+                     and "weight" in name
+                 ):
+                     if isinstance(self.fabric.strategy, DeepSpeedStrategy):
+                         _grad = deepspeed.utils.safe_get_full_grad(param)
+                     else:
+                         _grad = param.grad
+
+                     assert _grad is not None, f"Gradient is None for layer: {name}"
+                     name = re.sub(r"\.weight", "", name)
+                     checkpoint_gradients[name] = _grad.detach().cpu()
+
+             # zero out the gradients
+             self.model.zero_grad()
+
+         return checkpoint_activations, checkpoint_weights, checkpoint_gradients
+
+     ########################################################
+     #
+     # Setup forward hooks to save activations and weights at each layer
+     #
+     ########################################################
+
+     def _setup_forward_hooks(self, checkpoint_activations, checkpoint_weights):
+         """Setup forward hooks for the model to save activations and weights at each layer.
+
+         This function will set up forward hooks on the layers of the model that we are interested
+         in. The forward hooks will save the activations and weights at each layer whenever the
+         forward pass is performed.
+
+         Args:
+             checkpoint_activations: A dictionary to store the activations at each layer.
+             checkpoint_weights: A dictionary to store the weights at each layer.
+
+         Returns:
+             A list of forward hooks. We do this so that we can remove the hooks after the forward pass
+             is complete.
+         """
+
+         forward_hooks = []
+         layer_suffixes = self.learning_dynamics_config.layer_suffixes
+
+         for name, module in self.model.named_modules():
+             if any(layer_suffix in name for layer_suffix in layer_suffixes):
+                 _forward_hook = module.register_forward_hook(
+                     self._get_forward_hook(
+                         name, checkpoint_activations, checkpoint_weights
+                     )
+                 )
+                 forward_hooks.append(_forward_hook)
+         return forward_hooks
+
+     def _get_forward_hook(
+         self, module_name, checkpoint_activations, checkpoint_weights
+     ):
+         """Get a forward hook for a given module.
+
+         This function is called by the _setup_forward_hooks function to set up a forward hook for a
+         given module. This function is a closure that captures the module_name,
+         checkpoint_activations, and checkpoint_weights.
+
+         Args:
+             module_name: The name of the module to set up a forward hook for.
+             checkpoint_activations: A dictionary to store the activations at each layer.
+             checkpoint_weights: A dictionary to store the weights at each layer.
+
+         Returns:
+             A forward hook for the given module.
+         """
+
+         def _forward_hook(module, _, module_out):
+             sequence_idx = self.learning_dynamics_config.sequence_idx
+
+             local_activations = module_out[:, sequence_idx, :].detach()
+
+             # Gather activations from all processes using fabric
+             gathered_activations = self.fabric.all_gather(local_activations)
+
+             # Reshape from [num_processes, batch_size, hidden_dim] to [total_batch_size, hidden_dim]
+             # NOTE: transposing allows us to interleave the activations from each process so that
+             # they are in the correct order. (i.e. activation N is from data sample N)
+             gathered_activations = gathered_activations.transpose(0, 1).reshape(
+                 -1, gathered_activations.shape[-1]
+             )
+
+             # check if there is already a key for the module name
+             if module_name not in checkpoint_activations:
+                 # if there is no key, then we create a new key and store the hidden states
+                 checkpoint_activations[module_name] = (
+                     gathered_activations.detach().cpu()
+                 )
+
+                 # extract the weight matrix just once
+                 weight_matrix = module.weight.detach().cpu()
+                 checkpoint_weights[module_name] = weight_matrix
+             else:
+                 # if there is already a key, then we concatenate the new hidden states to the existing ones
+                 checkpoint_activations[module_name] = torch.cat(
+                     (
+                         checkpoint_activations[module_name],
+                         gathered_activations.detach().cpu(),
+                     )
+                 )
+
+         return _forward_hook
+
+
+ def compute_learning_dynamics_states(
+     checkpointing_config: CheckpointingConfig,
+     fabric: Fabric,
+     model: nn.Module,
+     dataset: Dataset,
+     compute_gradients: bool = False,
+ ) -> Dict[str, torch.Tensor]:
+     """Computes the learning dynamics metrics for a given checkpoint step.
+
+     Uses the CheckpointStateExtractor to extract the activations, weights, and optionally gradients
+     of the model at a given checkpoint step.
+
+     Args:
+         checkpointing_config: The configuration object for checkpointing.
+         fabric: The Fabric instance for distributed training.
+         model: The model to extract states from.
+         dataset: The dataset to extract states from.
+         compute_gradients: Whether to compute the gradients of the model parameters.
+
+     Returns:
+         A dictionary containing the activations, weights, and optionally gradients of the model.
+     """
+
+     # NOTE: Synchronizing processes for fabric dataloader setup
+     fabric.barrier()
+     model.to("cpu")  # Offloading model to CPU
+
+     # Setting up Dataloader for learning dynamics
+     def _collate_fn(batch):
+         return {"input_ids": [entry["input_ids"] for entry in batch]}
+
+     batch_size = checkpointing_config.learning_dynamics.batch_size
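+     # NOTE: the integer division below assumes the configured batch size is evenly
+     # divisible by the world size; any remainder silently shrinks the effective batch.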
+ sub_batch_size = batch_size // fabric.world_size
269
+
270
+ # NOTE: Make sure to set drop_last to False, otherwise the last batch will be dropped
271
+ # and we will not have a complete set of activations for the last sample. Also,
272
+ # we need to set shuffle to False, otherwise the activations will be shuffled across
273
+ # processes and we will not be able to interleave them correctly.
274
+ extractor_dataloader = DataLoader(
275
+ dataset,
276
+ batch_size=sub_batch_size,
277
+ shuffle=False,
278
+ collate_fn=_collate_fn,
279
+ drop_last=False,
280
+ )
281
+ extractor_dataloader = fabric.setup_dataloaders(
282
+ extractor_dataloader, use_distributed_sampler=True
283
+ )
284
+
285
+ # Create a new model instance with same parameters but zero gradients
286
+ _model = initialize_model(model.config)
287
+ _model.load_state_dict(model.state_dict())
288
+
289
+ if isinstance(fabric.strategy, DeepSpeedStrategy):
290
+ _model, _ = fabric.setup(_model, DummyOptimizer(_model.parameters()))
291
+ else:
292
+ _model = fabric.setup(_model)
293
+
294
+ _model.zero_grad()
295
+
296
+ # setup forward hooks for the model to save activations and weights at each layer
297
+ state_extractor = CheckpointStateExtractor(
298
+ checkpointing_config.learning_dynamics, fabric, _model
299
+ )
300
+
301
+ checkpoint_activations, checkpoint_weights, checkpoint_gradients = (
302
+ state_extractor.extract_states(
303
+ extractor_dataloader, compute_gradients=compute_gradients
304
+ )
305
+ )
306
+
307
+ del _model
308
+ torch.cuda.empty_cache()
309
+
310
+ # NOTE: Synchronizing processes for model setup
311
+ fabric.barrier()
312
+
313
+ model.to(fabric.device)
314
+
315
+ # NOTE: Trimming down the activations to match the dataset size;
316
+ # This is because the DataSampler might add extra samples to the dataset to make it evenly divisible
317
+ # by the number of processes. We need to remove these extra samples.
318
+ for layer_name, layer_activations in checkpoint_activations.items():
319
+ if len(layer_activations) > len(dataset):
320
+ checkpoint_activations[layer_name] = layer_activations[: len(dataset)]
321
+ elif len(layer_activations) < len(dataset):
322
+ raise ValueError(
323
+ f"Number of activations ({len(layer_activations)}) in layer {layer_name} does not match number of samples in dataset ({len(dataset)})"
324
+ )
325
+
326
+ return {
327
+ "activations": checkpoint_activations,
328
+ "weights": checkpoint_weights,
329
+ "gradients": checkpoint_gradients,
330
+ }
331
+
332
+
333
+ @rank_zero_only
334
+ @use_backoff()
335
+ def save_learning_dynamics_states(
336
+ checkpointing_config: CheckpointingConfig,
337
+ checkpoint_step: int,
338
+ prefix: str,
339
+ fabric: Fabric,
340
+ learning_dynamics_states: Dict[str, torch.Tensor],
341
+ learning_dynamics_dataset: Optional[Dataset] = None,
342
+ tokenizer: Optional[PreTrainedTokenizerBase] = None,
343
+ ) -> None:
344
+ """Save the learning dynamics metrics to the checkpointing directory.
345
+
346
+ By default only the learning dynamics states are saved. If the learning dynamics dataset
347
+ is provided, it is also saved; if a tokenizer is provided, the dataset is also detokenized
348
+ (i.e. a new column with the text is added to the dataset).
349
+
350
+ The learning dynamics dataset is saved in the checkpointing directory as a HuggingFace
351
+ dataset.
352
+
353
+ Creates a versioned checkpoint directory with the following structure:
354
+
355
+ {checkpointing_config.runs_dir}/
356
+ └── {checkpointing_config.run_name}/
357
+ └── {checkpointing_config.checkpoints_dir}/
358
+ ├── step_{checkpoint_step}/
359
+ │ └── {checkpointing_config.learning_dynamics_dir}/ # Learning Dynamics files
360
+ │ ├── {prefix}_activations.pt
361
+ │ ├── {prefix}_weights.pt
362
+ │ └── {prefix}_gradients.pt
363
+ │ └── {prefix}_data/ # if learning_dynamics_dataset is provided
364
+ └── latest -> step_{checkpoint_step}/
365
+
366
+ NOTE: this function is only called on rank 0
367
+
368
+ Args:
369
+ checkpointing_config: The configuration object for checkpointing.
370
+ checkpoint_step: The checkpoint step at which the learning dynamics states were computed.
371
+ prefix: The prefix for the learning dynamics states.
372
+ fabric: The Fabric instance for distributed training.
373
+ learning_dynamics_states: The learning dynamics states to save.
374
+ learning_dynamics_dataset: The dataset containing learning dynamics data,
375
+ including input IDs that need to be decoded. (optional)
376
+ tokenizer: The tokenizer used to decode input IDs into text. (optional)
377
+ """
378
+
379
+ runs_dir = checkpointing_config.runs_dir
380
+ run_name = checkpointing_config.run_name
381
+ checkpoints_dir = checkpointing_config.checkpoints_dir
382
+ learning_dynamics_dir = checkpointing_config.learning_dynamics_dir
383
+
384
+ run_path = os.path.join(runs_dir, run_name)
385
+ root_checkpoint_path = os.path.join(run_path, checkpoints_dir)
386
+ checkpoint_path = os.path.join(root_checkpoint_path, f"step_{checkpoint_step}")
387
+ learning_dynamics_path = os.path.join(checkpoint_path, learning_dynamics_dir)
388
+ os.makedirs(learning_dynamics_path, exist_ok=True)
389
+
390
+ # save the learning dynamics states
391
+ for key, value in learning_dynamics_states.items():
392
+ if value is not None and len(value) > 0:
393
+ torch.save(
394
+ value, os.path.join(learning_dynamics_path, f"{prefix}_{key}.pt")
395
+ )
396
+
397
+ if learning_dynamics_dataset is not None:
398
+ if tokenizer is not None:
399
+ # go through dataset and decode the input ids; and add back into dataset
400
+ detokenized_dataset = {"input_ids": [], "text": []}
401
+
402
+ for entry in learning_dynamics_dataset:
403
+ input_ids = entry["input_ids"]
404
+ decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True)
405
+ detokenized_dataset["input_ids"].append(input_ids)
406
+ detokenized_dataset["text"].append(decoded_text)
407
+
408
+ learning_dynamics_dataset = Dataset.from_dict(detokenized_dataset)
409
+
410
+ learning_dynamics_dataset_path = os.path.join(
411
+ learning_dynamics_path, f"{prefix}_data"
412
+ )
413
+ learning_dynamics_dataset.save_to_disk(learning_dynamics_dataset_path)
414
+
415
+ if checkpointing_config.save_to_hf:
416
+ # Upload the HF model
417
+ upload_folder(
418
+ folder_path=learning_dynamics_path,
419
+ path_in_repo=learning_dynamics_dir,
420
+ repo_id=checkpointing_config.hf_checkpoint.repo_id,
421
+ commit_message=f"Saving Learning Dynamics Data ({prefix}) -- Step {checkpoint_step}",
422
+ revision=checkpointing_config.run_name,
423
+ token=os.getenv("HF_TOKEN"),
424
+ )
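For reference, a minimal sketch of loading these states back for analysis. The paths follow the directory layout documented above; the run name, step, and "train" prefix are illustrative placeholders:

import os

import torch
from datasets import load_from_disk

# Hypothetical run/step/prefix -- adjust to your own checkpoint layout
ld_path = os.path.join("runs", "demo", "checkpoints", "step_1000", "learning_dynamics")

# The states were written with torch.save, so torch.load recovers the tensors
activations = torch.load(os.path.join(ld_path, "train_activations.pt"))
weights = torch.load(os.path.join(ld_path, "train_weights.pt"))

# The (optionally detokenized) dataset was written with Dataset.save_to_disk
data = load_from_disk(os.path.join(ld_path, "train_data"))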
src/checkpointing/training.py ADDED
@@ -0,0 +1,287 @@
1
+ """
2
+ Utilities for checkpointing training-related states (i.e. model, optimizer, lr_scheduler, etc.)
3
+
4
+ We save both a HuggingFace model and a Fabric-specific checkpoint. The HuggingFace model is
5
+ saved at the step-specific checkpoint directory, while the Fabric-specific checkpoint is saved
6
+ in a subdirectory. This is done to facilitate easier versioning of the HuggingFace model files
7
+ (which are what gets uploaded to the Hub).
8
+ """
9
+
10
+ import os
11
+ from dataclasses import asdict
12
+ from typing import Any, Dict, Tuple, Union
13
+
14
+ import yaml
15
+ from huggingface_hub import upload_file, upload_folder
16
+ from lightning.fabric import Fabric
17
+ from lightning.fabric.strategies import DeepSpeedStrategy
18
+ from lightning.fabric.utilities.seed import _collect_rng_states, _set_rng_states
19
+ from torch import nn
20
+ from torch.optim import Optimizer
21
+ from torch.optim.lr_scheduler import LRScheduler
22
+ from transformers import PreTrainedTokenizerBase
23
+
24
+ from src.config import CheckpointingConfig
25
+ from src.training.utils.io import use_backoff
26
+
27
+
28
+ @use_backoff()
29
+ def load_checkpoint(
30
+ checkpointing_config: CheckpointingConfig,
31
+ checkpoint_step: Union[str, int],
32
+ fabric: Fabric,
33
+ model: nn.Module,
34
+ optimizer: Optimizer,
35
+ lr_scheduler: LRScheduler,
36
+ ) -> Tuple[nn.Module, Optimizer, LRScheduler, int]:
37
+ """Load model checkpoint and associated states from a given step.
38
+
39
+ Args:
40
+ checkpointing_config: Configuration object containing checkpoint settings
41
+ checkpoint_step: The step at which to load the checkpoint
42
+ fabric: Lightning Fabric instance for distributed training support
43
+ model: The model instance to load weights into
44
+ optimizer: The optimizer instance to load states into
45
+ lr_scheduler: The learning rate scheduler to load states into
46
+
47
+ Returns:
48
+ Tuple containing the model, optimizer, lr_scheduler, and checkpoint step.
49
+ Returns None if no checkpoint is found.
50
+ """
51
+
52
+ if isinstance(checkpoint_step, int):
53
+ checkpoint_step = f"step_{checkpoint_step}"
54
+
55
+ checkpoint_path = os.path.join(
56
+ checkpointing_config.runs_dir,
57
+ checkpointing_config.run_name,
58
+ checkpointing_config.checkpoints_dir,
59
+ checkpoint_step,
60
+ )
61
+
62
+ if not os.path.exists(checkpoint_path):
63
+ return None
64
+
65
+ # Load from specified fabric checkpoint subdirectory
66
+ fabric_checkpoint_path = os.path.join(
67
+ checkpoint_path, checkpointing_config.fabric_checkpoint_dir
68
+ )
69
+
70
+ checkpoint_state = {
71
+ "_model": model,
72
+ "_optimizer": optimizer,
73
+ "_lr_scheduler": lr_scheduler,
74
+ }
75
+
76
+ if not isinstance(fabric.strategy, DeepSpeedStrategy):
77
+ fabric_load_file = os.path.join(
78
+ fabric_checkpoint_path, checkpointing_config.fabric_checkpoint_filename
79
+ )
80
+ else:
81
+ # Deepspeed checkpoints create sub-directory with distributed checkpoint file
82
+ fabric_load_file = fabric_checkpoint_path
83
+
84
+ extra_state = fabric.load(fabric_load_file, state=checkpoint_state)
85
+
86
+ # NOTE: extra_state will contain any additional states that were saved in the checkpoint
87
+ checkpoint_step = extra_state["_checkpoint_step"]
88
+
89
+ if "_rng_states" in extra_state:
90
+ _rng_states = extra_state["_rng_states"]
91
+ _set_rng_states(_rng_states)
92
+
93
+ return model, optimizer, lr_scheduler, checkpoint_step
94
+
95
+
96
+ @use_backoff()
97
+ def save_checkpoint(
98
+ configs: Dict[str, Any],
99
+ checkpoint_step: int,
100
+ fabric: Fabric,
101
+ model: nn.Module,
102
+ optimizer: Optimizer,
103
+ lr_scheduler: LRScheduler,
104
+ tokenizer: PreTrainedTokenizerBase,
105
+ upload_logs: bool = False,
106
+ ) -> None:
107
+ """Save training checkpoint and associated states to disk and optionally to HuggingFace Hub.
108
+
109
+ We save the following files:
110
+ - HuggingFace model files (config.json, model.safetensors)
111
+ - Tokenizer files (tokenizer.json, tokenizer_config.json)
112
+ - Fabric-specific files - fabric state of the model, optimizer, and lr_scheduler. If using
113
+ DeepSpeed, the checkpoint is saved in a subdirectory, otherwise it is saved in a single file.
114
+
115
+ Note that the HuggingFace model files are saved at the step-specific checkpoint directory, while the
116
+ Fabric-specific files are saved in a subdirectory. This is done to facilitate easier
117
+ versioning of the HuggingFace model files (which are what gets uploaded to the Hub).
118
+
119
+ NOTE: Why do we save a HF model at all? We do this because it makes it easier to load the model
120
+ in a separate script for evaluation and to play nicely with the HuggingFace Hub.
121
+
122
+ Creates a versioned checkpoint directory with the following structure:
123
+
124
+ {checkpointing_config.runs_dir}/
125
+ └── {checkpointing_config.run_name}/
126
+ ├── training_config.yaml # Training config
127
+ └── {checkpointing_config.checkpoints_dir}/
128
+ ├── step_{checkpoint_step}/
129
+ │ ├── config.json # HuggingFace model config
130
+ │ ├── model.safetensors # HuggingFace model weights
131
+ │ ├── pico_{model_type}.py # HuggingFace custom model class
132
+ │ ├── tokenizer.json # Tokenizer vocab
133
+ │ ├── tokenizer_config.json # Tokenizer config
134
+ │ └── {checkpointing_config.fabric_checkpoint_dir}/ # Fabric-specific files
135
+ │ └── checkpoint/ # Distributed model checkpoint files (if using DeepSpeed)
136
+ │ OR
137
+ │ └── checkpoint.pt # Single checkpoint file (if using other strategies)
138
+ └── latest -> step_{checkpoint_step}/
139
+
140
+ Args:
141
+ configs: A dictionary containing the initialized configuration objects.
142
+ checkpoint_step: The current training checkpoint step (i.e. number of learning steps taken)
143
+ fabric: Lightning Fabric instance for distributed training support
144
+ model: The model instance to save
145
+ optimizer: The optimizer instance to save
146
+ lr_scheduler: The learning rate scheduler to save
147
+ tokenizer: The tokenizer to save
148
+ upload_logs: Whether to upload training logs to HF Hub (default: False)
149
+
150
+ """
151
+
152
+ checkpointing_config = configs["checkpointing"]
153
+
154
+ # Get the directories from the training config
155
+ runs_dir = checkpointing_config.runs_dir
156
+ checkpoints_dir = checkpointing_config.checkpoints_dir
157
+ fabric_checkpoint_dir = checkpointing_config.fabric_checkpoint_dir
158
+ logs_dir = checkpointing_config.logs_dir
159
+
160
+ run_path = os.path.join(runs_dir, checkpointing_config.run_name)
161
+ root_checkpoint_path = os.path.join(run_path, checkpoints_dir)
162
+ checkpoint_path = os.path.join(root_checkpoint_path, f"step_{checkpoint_step}")
163
+
164
+ # Create directories
165
+ os.makedirs(checkpoint_path, exist_ok=True)
166
+
167
+ ########################################################
168
+ #
169
+ # Save HuggingFace files
170
+ #
171
+ ########################################################
172
+
173
+ # NOTE: we convert the Pico model to a HuggingFace model before saving it. See `model.py`
174
+ # for more details.
175
+ if fabric.global_rank == 0:
176
+ hf_model = model.convert_to_hf_model()
177
+ hf_model.save_pretrained(checkpoint_path)
178
+ tokenizer.save_pretrained(checkpoint_path)
179
+
180
+ ########################################################
181
+ #
182
+ # Save Fabric-specific files
183
+ #
184
+ ########################################################
185
+
186
+ # Create fabric-specific subdirectory
187
+ fabric_checkpoint_path = os.path.join(checkpoint_path, fabric_checkpoint_dir)
188
+ os.makedirs(fabric_checkpoint_path, exist_ok=True)
189
+
190
+ # Save model states (use underscore to avoid conflicts with third-party libraries)
191
+ checkpoint_state = {
192
+ "_model": model,
193
+ "_optimizer": optimizer,
194
+ "_lr_scheduler": lr_scheduler,
195
+ "_checkpoint_step": checkpoint_step,
196
+ }
197
+
198
+ if not isinstance(fabric.strategy, DeepSpeedStrategy):
199
+ checkpoint_state["_rng_states"] = _collect_rng_states()
200
+ fabric_save_file = os.path.join(
201
+ fabric_checkpoint_path, checkpointing_config.fabric_checkpoint_filename
202
+ )
203
+ else:
204
+ # Deepspeed checkpoints create sub-directory with distributed checkpoint file
205
+ fabric_save_file = fabric_checkpoint_path
206
+
207
+ fabric.save(fabric_save_file, checkpoint_state)
208
+
209
+ if fabric.global_rank == 0:
210
+ # Save the training config at the run root (written only once)
211
+ config_path = os.path.join(run_path, "training_config.yaml")
212
+ if not os.path.exists(config_path):
213
+ # Converting dataclasses to joined dicts and saving to file
214
+ _training_config = {}
215
+ for config_name, config in configs.items():
216
+ _training_config[config_name] = asdict(config)
217
+ with open(config_path, "w") as f:
218
+ yaml.dump(_training_config, f)
219
+
220
+ # Update latest symlink
221
+ latest_symlink_path = os.path.join(root_checkpoint_path, "latest")
222
+ if os.path.lexists(latest_symlink_path):
223
+ os.remove(latest_symlink_path)
224
+ os.symlink(
225
+ f"step_{checkpoint_step}", latest_symlink_path, target_is_directory=True
226
+ )
227
+
228
+ ########################################################
229
+ #
230
+ # Push to HuggingFace Hub (if configured)
231
+ #
232
+ ########################################################
233
+
234
+ if fabric.global_rank == 0:
235
+ # Push only on rank zero thread
236
+
237
+ if checkpointing_config.save_to_hf:
238
+ repo_id = checkpointing_config.hf_checkpoint.repo_id
239
+
240
+ # Upload the HF model
241
+ hf_model.push_to_hub(
242
+ repo_id=repo_id,
243
+ commit_message=f"Saving HF Model -- Step {checkpoint_step}",
244
+ revision=checkpointing_config.run_name,
245
+ token=os.getenv("HF_TOKEN"),
246
+ )
247
+
248
+ if checkpoint_step == 0:
249
+ # Uploading Tokenizer during first step since it never changes
250
+ tokenizer.push_to_hub(
251
+ repo_id=repo_id,
252
+ commit_message=f"Saving Tokenizer -- Step {checkpoint_step}",
253
+ revision=checkpointing_config.run_name,
254
+ token=os.getenv("HF_TOKEN"),
255
+ )
256
+
257
+ # Upload training config, also only in first step
258
+ upload_file(
259
+ path_or_fileobj=config_path,
260
+ path_in_repo="training_config.yaml",
261
+ repo_id=repo_id,
262
+ commit_message=f"Saving Training Config -- Step {checkpoint_step}",
263
+ revision=checkpointing_config.run_name,
264
+ token=os.getenv("HF_TOKEN"),
265
+ )
266
+
267
+ # Upload the fabric checkpoint directory
268
+ upload_folder(
269
+ folder_path=fabric_checkpoint_path,
270
+ path_in_repo=fabric_checkpoint_dir,
271
+ repo_id=repo_id,
272
+ commit_message=f"Saving Fabric Checkpoint -- Step {checkpoint_step}",
273
+ revision=checkpointing_config.run_name,
274
+ token=os.getenv("HF_TOKEN"),
275
+ )
276
+
277
+ # Upload logs if requested
278
+ if upload_logs:
279
+ logs_path = os.path.join(run_path, logs_dir)
280
+ upload_folder(
281
+ folder_path=logs_path,
282
+ path_in_repo=logs_dir,
283
+ repo_id=repo_id,
284
+ commit_message=f"Saving Logs -- Step {checkpoint_step}",
285
+ revision=checkpointing_config.run_name,
286
+ token=os.getenv("HF_TOKEN"),
287
+ )
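A rough sketch of how save_checkpoint pairs with load_checkpoint for auto-resume; the fabric, model, optimizer, and lr_scheduler objects are assumed to already be initialized, as in the trainer:

# Resume from the "latest" symlink; load_checkpoint returns None when no
# checkpoint exists yet, in which case training starts from step 0.
resume_state = load_checkpoint(
    checkpointing_config, "latest", fabric, model, optimizer, lr_scheduler
)
if resume_state is not None:
    model, optimizer, lr_scheduler, start_step = resume_state
else:
    start_step = 0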
src/config/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ """
2
+ Pico Config Package
3
+
4
+ The modules of this package are where you can specify the hyperparameters for the Pico model,
5
+ the dataset, the training process, evaluation, etc.
6
+
7
+ As with everything else in Pico, we've designed the configuration setup to be as flexible
8
+ as possible. By default the configs are implemented as vanilla dataclasses -- this makes it easy to
9
+ switch to a different config management system, such as Hydra, if you want.
10
+
11
+ Some things to NOTE:
12
+ - All hyperparameters are initialized with default values, which can be overridden.
13
+ - The default vocab size is set to the size of the OLMo tokenizer.
14
+ """
15
+
16
+ # For convenience, we export the config classes here
17
+ from .checkpointing_config import CheckpointingConfig
18
+ from .data_config import DataConfig
19
+ from .evaluation_config import EvaluationConfig
20
+ from .model_config import ModelConfig
21
+ from .monitoring_config import MonitoringConfig
22
+ from .training_config import TrainingConfig
23
+
24
+ __all__ = [
25
+ "CheckpointingConfig",
26
+ "DataConfig",
27
+ "EvaluationConfig",
28
+ "ModelConfig",
29
+ "MonitoringConfig",
30
+ "TrainingConfig",
31
+ ]
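Because these are vanilla dataclasses, overriding a default is just a constructor argument; a small illustration (the values are arbitrary):

from src.config import ModelConfig, TrainingConfig

model_config = ModelConfig(d_model=96, n_layers=6)  # shrink the default model
training_config = TrainingConfig()  # keep every default
print(model_config.d_model, training_config.max_steps)  # 96 200000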
src/config/_constants.py ADDED
@@ -0,0 +1,18 @@
1
+ """
2
+ Constants used throughout the codebase
3
+ """
4
+
5
+ # Basic Training Constants used throughout the codebase
6
+ VOCAB_SIZE = 50304
7
+ MAX_SEQ_LEN = 2048
8
+ BATCH_SIZE = 1024
9
+ GRADIENT_ACCUMULATION_STEPS = 128
10
+
11
+ # Directories used to store training runs, checkpoints, logs, and evaluation results
12
+ RUNS_DIR = "runs"
13
+ CHECKPOINTS_DIR = "checkpoints"
14
+ LOGS_DIR = "logs"
15
+ FABRIC_CHECKPOINT_DIR = "fabric_state"
16
+ FABRIC_CHECKPOINT_FILENAME = "checkpoint.pt"
17
+ LEARNING_DYNAMICS_DIR = "learning_dynamics"
18
+ EVAL_RESULTS_DIR = "eval_results"
src/config/checkpointing_config.py ADDED
@@ -0,0 +1,97 @@
1
+ """
2
+ Checkpointing Config
3
+
4
+ Specifies the hyperparameters for the checkpointing process; checkpointing is used to save
5
+ the model and optimizer states, as well as the learning dynamics metrics.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import List, Optional
10
+
11
+ from ._constants import (
12
+ CHECKPOINTS_DIR,
13
+ EVAL_RESULTS_DIR,
14
+ FABRIC_CHECKPOINT_DIR,
15
+ FABRIC_CHECKPOINT_FILENAME,
16
+ LEARNING_DYNAMICS_DIR,
17
+ LOGS_DIR,
18
+ RUNS_DIR,
19
+ )
20
+
21
+
22
+ @dataclass
23
+ class TrainingCheckpointingConfig:
24
+ # Automatically resume training from the most recent checkpoint
25
+ auto_resume: bool = True
26
+
27
+
28
+ @dataclass
29
+ class EvaluationCheckpointingConfig:
30
+ # Directory in which evaluation results are saved
31
+ eval_results_dir: str = EVAL_RESULTS_DIR
32
+
33
+
34
+ @dataclass
35
+ class LearningDynamicsCheckpointingConfig:
36
+ # Suffixes of the layers to compute learning dynamics for
37
+ layer_suffixes: List[str] = field(
38
+ default_factory=lambda: [
39
+ "attention.v_proj",
40
+ "attention.o_proj",
41
+ "swiglu.w_2",
42
+ ]
43
+ )
44
+
45
+ # Sequence index at which to extract hidden states; by default, we extract the hidden states
46
+ # at the last token of the sequence (-1)
47
+ sequence_idx: int = -1
48
+
49
+ # size of the sub-batch used for extracting learning dynamics states
50
+ batch_size: int = 8
51
+
52
+ # Path to evaluation dataset - used across learning dynamics checkpointing for consistency
53
+ # NOTE: set to None to disable extracting learning dynamics states for an eval_batch
54
+ # NOTE: this dataset should be small, ideally just a batch of additional data
55
+ eval_data: Optional[str] = "pico-lm/pretokenized-paloma-tinsy"
56
+
57
+
58
+ @dataclass
59
+ class HuggingFaceCheckpointingConfig:
60
+ # Should be in the format of <(username or organization name)>/<repo_name>, e.g. pico-lm/demo
61
+ repo_id: str = ""
62
+
63
+ # HuggingFace Collection Slug (specifies a tag for the run)
64
+ collection_slug: Optional[str] = None
65
+
66
+
67
+ @dataclass
68
+ class CheckpointingConfig:
69
+ # Assign a name to the run
70
+ run_name: Optional[str] = None
71
+
72
+ # Defining checkpointing directories
73
+ runs_dir: str = RUNS_DIR
74
+ checkpoints_dir: str = CHECKPOINTS_DIR
75
+ logs_dir: str = LOGS_DIR
76
+ fabric_checkpoint_dir: str = FABRIC_CHECKPOINT_DIR
77
+ fabric_checkpoint_filename: str = FABRIC_CHECKPOINT_FILENAME
78
+ learning_dynamics_dir: str = LEARNING_DYNAMICS_DIR
79
+
80
+ # How often to save checkpoints
81
+ save_every_n_steps: int = 1000
82
+
83
+ # Whether to save checkpoints to HuggingFace
84
+ save_to_hf: Optional[bool] = False
85
+ hf_checkpoint: HuggingFaceCheckpointingConfig = field(
86
+ default_factory=HuggingFaceCheckpointingConfig
87
+ )
88
+
89
+ training: TrainingCheckpointingConfig = field(
90
+ default_factory=TrainingCheckpointingConfig
91
+ )
92
+ evaluation: EvaluationCheckpointingConfig = field(
93
+ default_factory=EvaluationCheckpointingConfig
94
+ )
95
+ learning_dynamics: LearningDynamicsCheckpointingConfig = field(
96
+ default_factory=LearningDynamicsCheckpointingConfig
97
+ )
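For example, enabling Hub uploads only requires filling in the nested HF config; a minimal sketch using the placeholder repo id from the docstring above:

config = CheckpointingConfig(
    run_name="demo",  # hypothetical run name
    save_to_hf=True,
    hf_checkpoint=HuggingFaceCheckpointingConfig(repo_id="pico-lm/demo"),
)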
src/config/data_config.py ADDED
@@ -0,0 +1,36 @@
1
+ """
2
+ Data Config
3
+
4
+ Specifies the hyperparameters for the dataset, dataloader, and tokenizer.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+
9
+ from ._constants import BATCH_SIZE, VOCAB_SIZE
10
+
11
+
12
+ @dataclass
13
+ class DatasetConfig:
14
+ # Defines the HuggingFace name of a dataset
15
+ name: str = "pico-lm/pretokenized-dolma"
16
+
17
+
18
+ @dataclass
19
+ class DataLoaderConfig:
20
+ # NOTE: You should only change these values jointly with the training config; so that the
21
+ # sub-batch size is consistent with the gradient accumulation steps
22
+ batch_size: int = BATCH_SIZE
23
+
24
+
25
+ @dataclass
26
+ class TokenizerConfig:
27
+ # Specify a tokenizer to use
28
+ name: str = "allenai/OLMo-7B-0724-hf"
29
+ vocab_size: int = VOCAB_SIZE
30
+
31
+
32
+ @dataclass
33
+ class DataConfig:
34
+ dataset: DatasetConfig = field(default_factory=DatasetConfig)
35
+ dataloader: DataLoaderConfig = field(default_factory=DataLoaderConfig)
36
+ tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig)
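Note how the defaults line up: a dataloader batch of BATCH_SIZE = 1024 sequences split across GRADIENT_ACCUMULATION_STEPS = 128 accumulation steps yields sub-batches of 8, matching the learning-dynamics batch size. A quick sanity check:

from src.config._constants import BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS

sub_batch_size = BATCH_SIZE // GRADIENT_ACCUMULATION_STEPS
assert sub_batch_size == 8  # 1024 / 128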
src/config/evaluation_config.py ADDED
@@ -0,0 +1,28 @@
1
+ """
2
+ Evaluation Config
3
+
4
+ Specifies the hyperparameters for the evaluation process, i.e. what metrics to compute, etc.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+ from typing import List, Optional
9
+
10
+ from src.config._constants import MAX_SEQ_LEN
11
+
12
+
13
+ @dataclass
14
+ class PalomaEvaluationConfig:
15
+ dataset_name: str = "pico-lm/pretokenized-paloma-tinsy"
16
+ dataset_split: str = "val"
17
+ max_length: int = MAX_SEQ_LEN
18
+ batch_size: int = 16
19
+
20
+
21
+ @dataclass
22
+ class EvaluationConfig:
23
+ # Evaluation metrics to compute: by default, we compute the perplexity of the model on the paloma dataset
24
+ metrics: Optional[List[str]] = field(default_factory=lambda: ["paloma"])
25
+
26
+ # NOTE: Add other evaluation configs here
27
+ # Each evaluation metric should have its own config
28
+ paloma: PalomaEvaluationConfig = field(default_factory=PalomaEvaluationConfig)
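Following the pattern above, a new metric gets its own dataclass plus a field on EvaluationConfig; a hypothetical sketch (MyMetricEvaluationConfig and its fields are invented for illustration):

@dataclass
class MyMetricEvaluationConfig:
    dataset_name: str = "my-org/my-eval-set"  # hypothetical dataset
    batch_size: int = 16

# ...and on EvaluationConfig:
#     my_metric: MyMetricEvaluationConfig = field(default_factory=MyMetricEvaluationConfig)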
src/config/model_config.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ Model Config
3
+
4
+ Specifies the hyperparameters for the Pico model/model architecture.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Optional
9
+
10
+ from ._constants import BATCH_SIZE, MAX_SEQ_LEN, VOCAB_SIZE
11
+
12
+
13
+ @dataclass
14
+ class ModelConfig:
15
+ model_type: str = "pico_decoder"
16
+
17
+ # Pico Decoder default hyperparameters
18
+
19
+ d_model: int = 768
20
+ n_layers: int = 12
21
+
22
+ vocab_size: int = VOCAB_SIZE
23
+ batch_size: int = BATCH_SIZE
24
+ max_seq_len: int = MAX_SEQ_LEN
25
+
26
+ attention_n_heads: int = 12
27
+ attention_n_kv_heads: Optional[int] = 4
28
+
29
+ activation_hidden_dim: int = 3072
30
+
31
+ norm_eps: float = 1e-6
32
+
33
+ position_emb_theta: float = 10000.0
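With these defaults, each attention head gets d_model / attention_n_heads = 768 / 12 = 64 dimensions, each KV head is shared by 12 / 4 = 3 query heads (grouped-query attention), and the SwiGLU hidden dimension is the usual 4 * d_model. A quick check:

config = ModelConfig()
assert config.d_model // config.attention_n_heads == 64  # head_dim
assert config.attention_n_heads // config.attention_n_kv_heads == 3  # GQA groups
assert config.activation_hidden_dim == 4 * config.d_model  # 3072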
src/config/monitoring_config.py ADDED
@@ -0,0 +1,29 @@
1
+ """
2
+ Monitoring Config
3
+
4
+ Specifies the monitoring process, e.g. how to log metrics and keep track of training progress.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+
9
+
10
+ @dataclass
11
+ class LoggingConfig:
12
+ log_level: str = "INFO"
13
+ log_every_n_steps: int = 100
14
+
15
+
16
+ @dataclass
17
+ class WandbConfig:
18
+ # configure logging to Weights and Biases
19
+ project: str = ""
20
+ entity: str = ""
21
+
22
+
23
+ @dataclass
24
+ class MonitoringConfig:
25
+ logging: LoggingConfig = field(default_factory=LoggingConfig)
26
+
27
+ # Weights and Biases
28
+ save_to_wandb: bool = False
29
+ wandb: WandbConfig = field(default_factory=WandbConfig)
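Turning on Weights and Biases logging is then just a matter of filling in the nested config; the project and entity names below are placeholders:

monitoring_config = MonitoringConfig(
    save_to_wandb=True,
    wandb=WandbConfig(project="pico-demo", entity="my-team"),
)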
src/config/training_config.py ADDED
@@ -0,0 +1,40 @@
1
+ """
2
+ Training Config
3
+
4
+ Specifies the hyperparameters for the training process, i.e. the optimizer, learning rate, etc.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+
9
+ from ._constants import GRADIENT_ACCUMULATION_STEPS
10
+
11
+
12
+ @dataclass
13
+ class FabricConfig:
14
+ # Configure nodes/devices for parallelised training
15
+ num_nodes: int = 1
16
+ num_devices: int = 1
17
+ precision: str = "bf16-mixed"
18
+ # Hardware accelerator to use, can be cpu/cuda/mps etc.
19
+ accelerator: str = "cuda"
20
+
21
+
22
+ @dataclass
23
+ class OptimizationConfig:
24
+ # Optimizer
25
+ optimizer: str = "adamw"
26
+ lr: float = 3e-4
27
+
28
+ # Learning Rate Scheduler
29
+ lr_scheduler: str = "linear_with_warmup"
30
+ lr_warmup_steps: int = 2500
31
+
32
+ # Define number of gradient accumulation steps
33
+ gradient_accumulation_steps: int = GRADIENT_ACCUMULATION_STEPS
34
+
35
+
36
+ @dataclass
37
+ class TrainingConfig:
38
+ fabric: FabricConfig = field(default_factory=FabricConfig)
39
+ optimization: OptimizationConfig = field(default_factory=OptimizationConfig)
40
+ max_steps: int = 200_000
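At these defaults, and assuming one full dataloader batch per optimizer step, each step consumes BATCH_SIZE * MAX_SEQ_LEN = 1024 * 2048 ≈ 2.1M tokens, so a full 200,000-step run sees roughly 420B tokens:

from src.config._constants import BATCH_SIZE, MAX_SEQ_LEN

tokens_per_step = BATCH_SIZE * MAX_SEQ_LEN  # 2,097,152
total_tokens = tokens_per_step * 200_000    # ~4.2e11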
src/evaluation/__init__.py ADDED
@@ -0,0 +1,103 @@
1
+ """
2
+ Pico Evaluation Package
3
+
4
+ This package implements the evaluation pipeline for the Pico language model. It provides
5
+ functionality to evaluate model performance using various metrics and handles the complete
6
+ evaluation workflow.
7
+
8
+ We recommend that each evaluation metric have its own config and be
9
+ implemented as a module in the `evaluation/tasks` directory that exposes a `run_<metric_name>` function.
10
+
11
+ NOTE: Out of the box we only support Paloma, but the structure is designed to be flexible and
12
+ you are meant to add whatever metrics you want. One of the main reasons we save
13
+ the model in the HuggingFace format is that it's easy to use third-party evaluation
14
+ libraries/frameworks.
15
+ """
16
+
17
+ import os
18
+
19
+ import torch
20
+ from lightning.fabric import Fabric
21
+ from torch import nn
22
+
23
+ from src.config import CheckpointingConfig, EvaluationConfig
24
+
25
+ from .tasks.paloma import run_paloma_evaluation
26
+
27
+
28
+ def run_evaluation(
29
+ evaluation_config: EvaluationConfig,
30
+ checkpointing_config: CheckpointingConfig,
31
+ fabric: Fabric,
32
+ model: nn.Module,
33
+ ) -> None:
34
+ """Run model evaluation using specified metrics in `evaluation_config`.
35
+
36
+ This function orchestrates the complete evaluation pipeline by:
37
+ 1. Resolving the model checkpoint path (either specified or latest) to load the model from;
38
+ during training, this is the path to the latest checkpoint in the run directory.
39
+ 2. Iterating over each evaluation metric, and running the corresponding evaluation function.
40
+ NOTE: we suggest you follow the pattern of the Paloma evaluation function, and implement
41
+ your own evaluation function for each metric in the `evaluation/tasks` directory.
42
+ 3. Aggregating results across all metrics in a dictionary, and returning it.
43
+
44
+ Args:
45
+ evaluation_config (EvaluationConfig): Configuration object containing:
46
+ - metrics (List[str]): Metrics to evaluate; each metric should have its
47
+ own config. Currently supported: ["paloma"];
48
+ - paloma (PalomaEvaluationConfig): Configuration for Paloma evaluation
49
+ - max_length (int): Maximum sequence length
50
+ - batch_size (int): Evaluation batch size
51
+ checkpointing_config (CheckpointingConfig): Configuration object for checkpointing
52
+ fabric (Fabric): Lightning Fabric instance
53
+ model (nn.Module): Original model instance
54
+
55
+ Returns:
56
+ Dict[str, float]: Dictionary mapping metric names to their values
57
+ Example: {"paloma": 3.45}
58
+
59
+ Raises:
60
+ ValueError: If an unsupported evaluation metric is requested
61
+
62
+ Example:
63
+ results = run_evaluation(
64
+ EvaluationConfig(
65
+ run_name="experiment_1",
66
+ metrics=["paloma"],
67
+ paloma=PalomaConfig(max_length=2048, batch_size=16)
68
+ )
69
+ )
70
+
71
+ """
72
+
73
+ fabric.barrier()
74
+
75
+ model.to("cpu") # Offloading model to CPU
76
+
77
+ evaluation_results = {}
78
+
79
+ # NOTE: Evaluation is only run on the first process (rank 0) so that third-party evaluation
81
+ # libraries can decide for themselves how to handle distributed evaluation.
81
+ if fabric.global_rank == 0:
82
+ run_name = checkpointing_config.run_name
83
+ model_path = f"{os.getcwd()}/{checkpointing_config.runs_dir}/{run_name}/{checkpointing_config.checkpoints_dir}/latest"
84
+ os.makedirs(model_path, exist_ok=True)
85
+
86
+ for metric in evaluation_config.metrics:
87
+ # NOTE: add your own metrics here
88
+ if metric == "paloma":
89
+ evaluation_result = run_paloma_evaluation(
90
+ model_path, evaluation_config.paloma
91
+ )
92
+ else:
93
+ raise ValueError(f"Metric {metric} not supported")
94
+
95
+ evaluation_results[metric] = evaluation_result
96
+
97
+ torch.cuda.empty_cache()
98
+
99
+ fabric.barrier()
100
+
101
+ model.to(fabric.device)
102
+
103
+ return evaluation_results
src/evaluation/tasks/paloma.py ADDED
@@ -0,0 +1,52 @@
1
+ """
2
+ Paloma is a comprehensive evaluation benchmark for large language models (LLMs) that focuses
3
+ on measuring perplexity across diverse text domains.
4
+
5
+ To evaluate on Paloma, we use the HuggingFace evaluate library.
6
+
7
+ For more details, see: https://huggingface.co/datasets/allenai/paloma
8
+ """
9
+
10
+ import evaluate
11
+ from datasets import load_dataset
12
+ from datasets.utils.logging import disable_progress_bar, enable_progress_bar
13
+
14
+ from src.config.evaluation_config import PalomaEvaluationConfig
15
+
16
+
17
+ def run_paloma_evaluation(
18
+ model_path: str,
19
+ paloma_config: PalomaEvaluationConfig,
20
+ ) -> float:
21
+ """Run Perplexity evaluation on the Paloma evaluation dataset.
22
+
23
+ We use the HuggingFace evaluate library to load in and compute the perplexity metric.
24
+
25
+ Args:
26
+ model_path (str): Path to the model checkpoint to be evaluated
27
+ paloma_config (PalomaEvaluationConfig): Configuration for Paloma evaluation
28
+ """
29
+
30
+ disable_progress_bar()
31
+
32
+ # load custom evaluation space, see https://huggingface.co/spaces/pico-lm/perplexity
33
+ perplexity = evaluate.load("pico-lm/perplexity")
34
+
35
+ dataset = load_dataset(
36
+ paloma_config.dataset_name, split=paloma_config.dataset_split
37
+ )["text"]
38
+
39
+ # compute perplexity score on Paloma dataset
40
+ perplexity_result = perplexity.compute(
41
+ model_id=model_path,
42
+ predictions=dataset,
43
+ add_start_token=False,
44
+ max_length=paloma_config.max_length,
45
+ batch_size=paloma_config.batch_size,
46
+ trust_remote_code=True,
47
+ )
48
+
49
+ mean_perplexity = perplexity_result["mean_perplexity"]
50
+
51
+ enable_progress_bar()
52
+ return mean_perplexity
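A minimal sketch of invoking this directly on a saved checkpoint; the path is illustrative and just needs to point at a directory the perplexity space can load a model from:

from src.config.evaluation_config import PalomaEvaluationConfig

ppl = run_paloma_evaluation(
    "runs/demo/checkpoints/latest",  # hypothetical checkpoint path
    PalomaEvaluationConfig(batch_size=8),
)
print(f"mean perplexity: {ppl:.2f}")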
src/model/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ """
2
+ Model Package
3
+
4
+ This package contains Pico models (currently only the Pico Decoder). We plan to implement other
5
+ architectures in the future.
6
+
7
+ If you have other models you'd like to implement, we recommend you add modules to this package.
8
+ """
9
+
10
+ from .pico_decoder import PicoDecoder
11
+
12
+ __all__ = ["PicoDecoder"]
src/model/pico_decoder.py ADDED
@@ -0,0 +1,911 @@
1
+ """
2
+ Pico Decoder: A Lightweight Causal Transformer Language Model
3
+
4
+ Pico Decoder uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
5
+
6
+ Everything is written with a modular design for easy modification and experimentation.
7
+
8
+ Key features:
9
+ - RMSNorm for layer normalization
10
+ - Rotary Positional Embeddings (RoPE)
11
+ - Multi-head attention with KV-cache support
12
+ - SwiGLU activation function
13
+ - KV-cache for faster autoregressive generation
15
+
15
+ - KV-cache for faster autoregressive generation
16
+
17
+ References:
18
+ - RoPE: https://arxiv.org/abs/2104.09864
19
+ - SwiGLU: https://arxiv.org/abs/2002.05202
20
+ - LLAMA: https://arxiv.org/abs/2302.13971
21
+
22
+ Adapted from:
23
+ - OLMO: https://github.com/allenai/OLMo
24
+ - LLAMA: https://github.com/meta-llama/llama
25
+ """
26
+
27
+ from dataclasses import asdict
28
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
29
+
30
+ import torch
31
+ import torch.nn as nn
32
+ import torch.nn.functional as F
33
+
34
+ # Handle PyTorch version compatibility for attention backend
35
+ try:
36
+ from torch.nn.attention import SDPBackend, sdpa_kernel
37
+
38
+ HAS_TORCH_ATTENTION = True
39
+ except ImportError:
40
+ # Fallback for older PyTorch versions
41
+ HAS_TORCH_ATTENTION = False
42
+ SDPBackend = None
43
+ sdpa_kernel = None
44
+
45
+ from transformers import GenerationMixin, PretrainedConfig, PreTrainedModel
46
+ from transformers.generation import GenerationConfig
47
+ from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
48
+
49
+ try:
50
+ if TYPE_CHECKING:
51
+ # We need to do this to avoid importing these when creating the HF-compatible models
52
+ from src.config import ModelConfig
53
+ except ImportError:
54
+ pass
55
+
56
+ ########################################################
57
+ #
58
+ # Layer Normalization
59
+ #
60
+ ########################################################
61
+
62
+
63
+ class RMSNorm(torch.nn.Module):
64
+ """Root Mean Square Layer Normalization.
65
+
66
+ A variant of Layer Normalization that uses RMS statistics instead of mean/variance,
67
+ resulting in improved stability and performance.
68
+
69
+ Args:
70
+ config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration object containing normalization parameters
71
+ - config.norm_eps: Small constant for numerical stability
72
+ - config.d_model: Model dimension for the weight parameter
73
+
74
+ References:
75
+ https://arxiv.org/abs/1910.07467
76
+ """
77
+
78
+ def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
79
+ super().__init__()
80
+ self.eps = config.norm_eps
81
+ self.weight = nn.Parameter(torch.ones(config.d_model))
82
+
83
+ def _norm(self, x: torch.Tensor) -> torch.Tensor:
84
+ """
85
+ Normalizes the input tensor by its RMS value.
86
+ """
87
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
88
+
89
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
90
+ """
91
+ Applies RMS normalization to the input tensor and scales it by the weight parameter.
92
+ """
93
+ output = self._norm(x.float()).type_as(x)
94
+ return output * self.weight
95
+
96
+
97
+ ########################################################
98
+ #
99
+ # Positional Embedding
100
+ #
101
+ ########################################################
102
+
103
+
104
+ class RoPE(nn.Module):
105
+ """Rotary Positional Embeddings (RoPE).
106
+
107
+ Implements position-dependent rotation of keys and queries in attention mechanism,
108
+ allowing better modeling of relative positions in sequences. Uses complex number
109
+ operations for efficient rotation.
110
+
111
+ Args:
112
+ config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration containing:
113
+ - config.position_emb_theta: Base for frequency computation
114
+ - config.d_model: Model dimension
115
+ - config.attention_n_heads: Number of attention heads
116
+ - config.max_seq_len: Maximum sequence length
117
+
118
+ References:
119
+ https://arxiv.org/abs/2104.09864
120
+ """
121
+
122
+ _freqs_cis_tensor: torch.Tensor | None = None
123
+
124
+ def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
125
+ super().__init__()
126
+
127
+ self.theta = config.position_emb_theta
128
+ self.dim = config.d_model // config.attention_n_heads
129
+
130
+ max_seq_len = config.max_seq_len
131
+
132
+ # only gets set once, and then reused for all RoPE instances
133
+ if RoPE._freqs_cis_tensor is None:
134
+ RoPE._freqs_cis_tensor = self._setup_freqs_cis(
135
+ max_seq_len, self.theta, self.dim
136
+ )
137
+
138
+ # register _freqs_cis buffer
139
+ # can be easily recomputed so persistent=False
140
+ self.register_buffer("_freqs_cis", self._freqs_cis_tensor, persistent=False)
141
+
142
+ @classmethod
143
+ def _setup_freqs_cis(cls, seq_len: int, theta: float, dim: int) -> torch.Tensor:
144
+ """Setup Frequency Tensor for RoPE Embeddings
145
+
146
+ Initializes the complex frequency tensor that is used to compute the RoPE embeddings.
147
+
148
+ Note that other implementations use cos and sin directly, but the complex
150
+ number representation is (probably) more efficient:
150
+
151
+ e^(theta * i * t) = cos(theta * t) + i * sin(theta * t) [Euler's formula]
152
+ """
153
+ _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
154
+ positions = torch.arange(seq_len)
155
+ freqs = torch.outer(positions, _freqs)
156
+ return torch.polar(torch.ones_like(freqs), freqs) # complex64
157
+
158
+ def get_freqs_cis(
159
+ self, input_shape: torch.Size, start_pos: int, end_pos: int
160
+ ) -> torch.Tensor:
161
+ """Reshape Frequency Tensor for RoPE Embeddings
162
+
163
+ Makes the frequency tensor broadcastable with the input tensor.
164
+ """
165
+ _freqs_cis = self._freqs_cis[start_pos:end_pos]
166
+ ndim = len(input_shape)
167
+ assert 0 <= 1 < ndim
168
+ assert _freqs_cis.shape == (input_shape[1], input_shape[-1])
169
+
170
+ # TODO: Check whether this is correct (might be able to remove this)
171
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
172
+ return _freqs_cis.view(*shape)
173
+
174
+ def forward(
175
+ self,
176
+ queries: torch.Tensor,
177
+ keys: torch.Tensor,
178
+ start_pos: int = 0,
179
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
180
+ """Apply RoPE Embeddings to Queries and Keys
181
+
182
+ Applies the rotary positional embeddings to the input tensors via complex num multiplication
183
+
184
+ NOTE: The start_pos is used if we want to use the kv_cache in the attention mechanism.
185
+ """
186
+ queries_ = torch.view_as_complex(
187
+ queries.float().reshape(*queries.shape[:-1], -1, 2)
188
+ )
189
+ keys_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))
190
+
191
+ input_shape = (
192
+ queries_.shape
193
+ ) # (batch_size, seq_len, n_heads, head_dim/2); keys may have fewer (KV) heads, which broadcasts
194
+ freqs_start_pos = start_pos
195
+ freqs_end_pos = freqs_start_pos + queries_.shape[1]
196
+
197
+ freqs_cis = self.get_freqs_cis(input_shape, freqs_start_pos, freqs_end_pos)
198
+
199
+ queries_rotated = torch.view_as_real(queries_ * freqs_cis).flatten(3)
200
+ keys_rotated = torch.view_as_real(keys_ * freqs_cis).flatten(3)
201
+ return queries_rotated.type_as(queries), keys_rotated.type_as(keys)
202
+
203
+
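As a small check of the Euler's-formula identity used in _setup_freqs_cis, torch.polar(ones, freqs) is exactly cos(freqs) + i*sin(freqs):

import torch

freqs = torch.tensor([0.0, 0.5, 1.0])
cis = torch.polar(torch.ones_like(freqs), freqs)
assert torch.allclose(cis.real, torch.cos(freqs))
assert torch.allclose(cis.imag, torch.sin(freqs))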
204
+ ########################################################
205
+ #
206
+ # Attention
207
+ #
208
+ ########################################################
209
+
210
+
211
+ class Attention(nn.Module):
212
+ """Multi-head Attention with Group Query Attention support.
213
+
214
+ Implements scaled dot-product attention and supports:
215
+ - Grouped Query Attention (GQA)
216
+ - Key-Value caching for efficient inference
217
+ - RoPE integration
218
+
219
+ Args:
220
+ config (Union[ModelConfig, PretrainedConfig]): Configuration containing:
221
+ - config.attention_n_heads: Number of attention heads
222
+ - config.attention_n_kv_heads: Number of key/value heads
223
+ - config.d_model: Model dimension
224
+ - config.batch_size: Maximum batch size
225
+ - config.max_seq_len: Maximum sequence length
226
+
227
+ Shape:
228
+ - Input: (batch_size, seq_len, d_model)
229
+ - Output: (batch_size, seq_len, d_model)
230
+ """
231
+
232
+ def __init__(
233
+ self,
234
+ config: Union["ModelConfig", "PicoDecoderHFConfig"],
235
+ ):
236
+ super().__init__()
237
+
238
+ self.n_heads = config.attention_n_heads
239
+ self.n_kv_heads = config.attention_n_kv_heads
240
+
241
+ self.batch_size = config.batch_size
242
+ self.max_seq_len = config.max_seq_len
243
+
244
+ d_model = config.d_model
245
+ self.head_dim = d_model // self.n_heads
246
+
247
+ self.n_rep = self.n_heads // self.n_kv_heads
248
+
249
+ self.q_proj = nn.Linear(d_model, self.n_heads * self.head_dim, bias=False)
250
+ self.k_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
251
+ self.v_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
252
+ self.o_proj = nn.Linear(self.n_heads * self.head_dim, d_model, bias=False)
253
+
254
+ self.rope = RoPE(config)
255
+
256
+ def forward(
257
+ self,
258
+ input: torch.Tensor,
259
+ mask: Optional[torch.Tensor] = None,
260
+ past_key_values: Optional[Tuple[torch.Tensor, ...]] = None,
261
+ use_cache: bool = False,
262
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
263
+ """Forward pass for the attention mechanism.
264
+
265
+ Computes queries, keys, and values for the attention mechanism. Applies rotary positional
266
+ embeddings to the queries and keys, and then computes attention scores and outputs.
267
+
268
+ For an introduction to the attention mechanism, see:
269
+ https://arxiv.org/abs/1706.03762
270
+
271
+ A few things to note:
272
+ - The past_key_values is used to implement the KV cache, which is used to speed up
273
+ generation by caching the KV pairs from previous forward passes. This is useful when doing
274
+ tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
275
+ modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
276
+ its own KV cache - this KV cache is implemented as a tuple.
277
+ """
278
+ bsz, seq_len, _ = input.shape
279
+ _queries, _keys, _values = (
280
+ self.q_proj(input),
281
+ self.k_proj(input),
282
+ self.v_proj(input),
283
+ )
284
+
285
+ # Reshaping for multi-head attention
286
+ queries = _queries.view(bsz, seq_len, self.n_heads, self.head_dim)
287
+ keys = _keys.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
288
+ values = _values.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
289
+
290
+ # The start position is used to apply the RoPE embeddings to only the new tokens
291
+ # when using the kv_cache in the attention mechanism.
292
+ # We want to start from the last position in the cache.
293
+ start_pos = 0
294
+ if past_key_values is not None and past_key_values[0] is not None:
295
+ start_pos = past_key_values[0].shape[1]
296
+
297
+ # apply rotary positional embeddings
298
+ queries, keys = self.rope(queries, keys, start_pos)
299
+
300
+ if (
301
+ past_key_values is not None
302
+ and past_key_values[0] is not None
303
+ and past_key_values[1] is not None
304
+ ):
305
+ keys = torch.cat([past_key_values[0], keys], dim=1)
306
+ values = torch.cat([past_key_values[1], values], dim=1)
307
+
308
+ if use_cache:
309
+ cached_keys = keys
310
+ cached_values = values
311
+ else:
312
+ cached_keys = None
313
+ cached_values = None
314
+
315
+ queries = queries.transpose(1, 2)
316
+ keys = keys.transpose(1, 2)
317
+ values = values.transpose(1, 2)
318
+
319
+ apply_gqa = self.n_rep > 1
320
+ if apply_gqa and queries.device.type == "mps":
321
+ # NOTE: MPS does not support GQA in the SDPA kernel, but we can repeat the keys and values
322
+ # outside of the kernel to get the same effect.
323
+ # See: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
324
+ keys = keys.repeat_interleave(self.n_rep, dim=-3)
325
+ values = values.repeat_interleave(self.n_rep, dim=-3)
326
+ apply_gqa = False
327
+
328
+ if HAS_TORCH_ATTENTION:
329
+ backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]
330
+ with sdpa_kernel(backends=backends):
331
+ attn_output = F.scaled_dot_product_attention(
332
+ queries.contiguous(),
333
+ keys.contiguous(),
334
+ values.contiguous(),
335
+ attn_mask=mask.to(queries.dtype) if mask is not None else None,
336
+ enable_gqa=apply_gqa,
337
+ )
338
+ else:
339
+ # Fallback for older PyTorch versions - use default backend
340
+ attn_output = F.scaled_dot_product_attention(
341
+ queries.contiguous(),
342
+ keys.contiguous(),
343
+ values.contiguous(),
344
+ attn_mask=mask.to(queries.dtype) if mask is not None else None,
345
+ enable_gqa=apply_gqa,
346
+ )
347
+
348
+ attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
349
+ output = self.o_proj(attn_output)
350
+
351
+ return output, (cached_keys, cached_values)
352
+
353
+
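The MPS fallback above leans on a simple equivalence: repeating each KV head n_rep times along the heads dimension and running ordinary multi-head SDPA matches what enable_gqa=True computes. A shape-level sketch with the default head counts:

import torch

bsz, n_kv_heads, seq_len, head_dim, n_rep = 2, 4, 16, 64, 3
keys = torch.randn(bsz, n_kv_heads, seq_len, head_dim)

# Each of the 4 KV heads is duplicated 3 times so it can serve 12 query heads
expanded = keys.repeat_interleave(n_rep, dim=-3)
assert expanded.shape == (bsz, n_kv_heads * n_rep, seq_len, head_dim)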
354
+ ########################################################
355
+ #
356
+ # SwiGLU (Combines MLP and Activation)
357
+ #
358
+ ########################################################
359
+
360
+
361
+ class SwiGLU(nn.Module):
362
+ """SwiGLU Activation Function with Linear Projections.
363
+
364
+ Implements the SwiGLU activation function combined with linear transformations,
365
+ serving as the feed-forward network in transformer blocks.
366
+
367
+ Args:
368
+ config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration containing:
369
+ - config.d_model: Model dimension
370
+ - config.activation_hidden_dim: Hidden dimension (typically 4 * d_model)
371
+
372
+ References:
373
+ https://arxiv.org/abs/2002.05202
374
+ """
375
+
376
+ def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
377
+ super().__init__()
378
+
379
+ model_dim = config.d_model
380
+ act_hidden_dim = config.activation_hidden_dim # usually 4 * d_model
381
+
382
+ self.w_0 = nn.Linear(model_dim, act_hidden_dim, bias=False)
383
+ self.w_1 = nn.Linear(model_dim, act_hidden_dim, bias=False)
384
+ self.w_2 = nn.Linear(act_hidden_dim, model_dim, bias=False)
385
+
386
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
387
+ return self.w_2(F.silu(self.w_0(x)) * self.w_1(x))
388
+
389
+
390
+ ########################################################
391
+ #
392
+ # PicoDecoderBlock
393
+ #
394
+ ########################################################
395
+
396
+
397
+ class PicoDecoderBlock(nn.Module):
398
+ """Single Transformer Block with Attention and Feed-forward layers.
399
+
400
+ Implements a standard transformer block with:
401
+ - Multi-head attention with normalization and residual connection
402
+ - SwiGLU feed-forward network with normalization and residual connection
403
+
404
+ Args:
405
+ config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration; either a dataclass or
406
+ a HuggingFace PicoDecoderHFConfig
407
+ """
408
+
409
+ def __init__(
410
+ self,
411
+ config: Union["ModelConfig", "PicoDecoderHFConfig"],
412
+ ):
413
+ super().__init__()
414
+
415
+ self.attention = Attention(config)
416
+ self.swiglu = SwiGLU(config)
417
+ self.attention_norm = RMSNorm(config)
418
+ self.swiglu_norm = RMSNorm(config)
419
+
420
+ def forward(
421
+ self,
422
+ input: torch.Tensor,
423
+ mask: Optional[torch.Tensor] = None,
424
+ past_key_values: Optional[Tuple[torch.Tensor]] = None,
425
+ use_cache: bool = False,
426
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
427
+ attention_output, cached_key_values = self.attention(
428
+ self.attention_norm(input),
429
+ mask=mask,
430
+ past_key_values=past_key_values,
431
+ use_cache=use_cache,
432
+ )
433
+ # NOTE: cached_key_values is None if use_cache is False
434
+
435
+ h = input + attention_output
436
+ out = h + self.swiglu(self.swiglu_norm(h))
437
+ return out, cached_key_values
438
+
439
+
440
+ ########################################################
441
+ #
442
+ # Pico Decoder (Causal Transformer Model)
443
+ #
444
+ ########################################################
445
+
446
+
447
+ class PicoDecoder(nn.Module):
448
+ """
449
+ Pico Decoder: combines the embedding, causal decoder blocks, and output projection into a
450
+ single autoregressive model.
451
+
452
+ For more information on the model, see the classes for the modules that make up the model.
453
+ """
454
+
455
+ def __init__(
456
+ self,
457
+ model_config: Union["ModelConfig", "PicoDecoderHFConfig"],
458
+ ):
459
+ super().__init__()
460
+ self.config = model_config
461
+
462
+ self.embedding_proj = nn.Embedding(self.config.vocab_size, self.config.d_model)
463
+ self.layers = nn.ModuleList(
464
+ [PicoDecoderBlock(self.config) for _ in range(self.config.n_layers)]
465
+ )
466
+ self.output_norm = RMSNorm(self.config)
467
+ self.de_embedding_proj = nn.Linear(
468
+ self.config.d_model, self.config.vocab_size, bias=False
469
+ )
470
+
471
+ def convert_to_hf_model(self) -> "PicoDecoderHF":
472
+ """Convert the Lightning model to a HuggingFace model."""
473
+ # Create HF config without fabric-specific settings
474
+ hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
475
+
476
+ # Create new HF model
477
+ hf_model = PicoDecoderHF(hf_config)
478
+
479
+ # Copy the state dict, prefixing keys to match the HF wrapper's submodule name
480
+ hf_model.load_state_dict(self.state_dict(prefix="pico_decoder."))
481
+
482
+ return hf_model
483
+
484
+ def forward(
485
+ self,
486
+ input_ids: torch.Tensor,
487
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
488
+ use_cache: bool = False,
489
+ ) -> Tuple[torch.Tensor, Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]]]:
490
+ """
491
+ This is the forward pass for the entire Pico model. It boils down to:
492
+ - Embedding the input ids
493
+ - Creating a causal mask
494
+ - Processing through the pico layers
495
+ - Projecting the output to logits
496
+
497
+ NOTE: One feature that might be confusing is the KV cache. The KV cache is used to speed up
498
+ generation by caching the KV pairs from previous forward passes. This is useful when doing
499
+ tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
500
+ modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
501
+ its own KV cache which is stored as a tuple. The whole model then stores a tuple of these
502
+ KV caches (so a tuple of tuples).
503
+ """
504
+
505
+ seq_len = input_ids.shape[-1]
506
+ h = self.embedding_proj(input_ids)
507
+
508
+ # Calculate start position from past cached KV pairs. Remember that each layer has its
509
+ # own KV Cache. So when we index past_key_values, we need to index into the KV pairs for the
510
+ # correct layer and then for either the keys or values.
511
+ start_pos = 0
512
+ if (
513
+ past_key_values is not None
514
+ and past_key_values[0] is not None
515
+ and past_key_values[0][0] is not None
516
+ ):
517
+ start_pos = past_key_values[0][0].shape[1]
518
+
519
+ # Create causal mask for current sequence
520
+ mask = None
521
+ if seq_len > 1:
522
+ mask = torch.full((seq_len, seq_len), float("-inf"))
523
+ mask = torch.triu(mask, diagonal=1)
524
+
525
+ # If using KV cache, extend mask to cover cached sequence length
526
+ if past_key_values is not None:
527
+ # Add zeros for cached tokens (we can attend to all of them)
528
+ mask = torch.hstack([torch.zeros((seq_len, start_pos)), mask])
529
+
530
+ mask = mask.to(h.device)
531
+
532
+ # NOTE: If we are using the cache, we need to store the cached KV pairs for each layer
533
+ # in a tuple. Each layer will have its own cached KV pair which we aggregate in a tuple.
534
+ cached_key_values = () if use_cache else None
535
+
536
+ # Process through transformer blocks
537
+ for idx, layer in enumerate(self.layers):
538
+ layer_past_key_values = None
539
+ if past_key_values is not None:
540
+ try:
541
+ # Handle both tuple-based cache and HuggingFace cache objects
542
+ if hasattr(past_key_values, "__getitem__") and idx < len(
543
+ past_key_values
544
+ ):
545
+ layer_past_key_values = past_key_values[idx]
546
+ except (KeyError, IndexError, TypeError):
547
+ # If we can't access the cache properly, just skip it
548
+ layer_past_key_values = None
549
+
550
+ h, layer_cached_key_values = layer(
551
+ h, mask=mask, past_key_values=layer_past_key_values, use_cache=use_cache
552
+ )
553
+
554
+ if use_cache:
555
+ cached_key_values += (layer_cached_key_values,)
556
+
557
+ # Final norm and projection
558
+ h = self.output_norm(h)
559
+ logits = self.de_embedding_proj(h).float()
560
+
561
+ return logits, cached_key_values
562
+
563
+
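A minimal greedy-decoding sketch showing how the tuple-of-tuples KV cache threads through repeated forward passes (the untrained model, random prompt, and argmax decoding are all just for illustration):

import torch

model = PicoDecoder(ModelConfig())  # untrained, illustration only
input_ids = torch.randint(0, 50304, (1, 8))  # fake prompt

cache = None
for _ in range(4):
    # After the first pass, only the newest token needs to be fed in;
    # the cache supplies the keys/values for everything before it.
    step_ids = input_ids if cache is None else input_ids[:, -1:]
    logits, cache = model(step_ids, past_key_values=cache, use_cache=True)
    next_token = logits[:, -1].argmax(dim=-1, keepdim=True)
    input_ids = torch.cat([input_ids, next_token], dim=-1)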
564
+ ########################################################
565
+ #
566
+ # HuggingFace Wrapper for the Pico Decoder model.
567
+ #
568
+ ########################################################
569
+
570
+
571
+ class PicoDecoderHFConfig(PretrainedConfig):
572
+ """Config class for the Pico Decoder HuggingFace wrapper."""
573
+
574
+ model_type = "pico_decoder"
575
+
576
+ @classmethod
577
+ def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
578
+ """
579
+ Initialize config from a dictionary. Note that no kwargs are passed to the constructor --
580
+ this is because some kwargs require special handling, which would make this
582
+ class brittle.
582
+ """
583
+ pico_config = cls(**config_dict)
584
+
585
+ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
586
+ unused_kwargs = {
587
+ key: value for key, value in kwargs.items() if not hasattr(pico_config, key)
588
+ }
589
+
590
+ if return_unused_kwargs:
591
+ return pico_config, unused_kwargs
592
+ return pico_config
593
+
594
+ @classmethod
595
+ def from_dataclass(cls, model_config: "ModelConfig"):
596
+ """Initialise from our custom config dataclass."""
597
+ return cls.from_dict(asdict(model_config))
598
+
599
+
600
+ class PicoDecoderHF(PreTrainedModel, GenerationMixin):
601
+ """
602
+ HuggingFace wrapper for the Pico model with generation support.
603
+
604
+ Many evaluation frameworks require a model to be set up as a HuggingFace model, so we provide a simple
605
+ wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
606
+ Pico model as well as the model wrapped in this HuggingFace class.
607
+
608
+ This also lets you do cool things like:
609
+
610
+ `model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
611
+ """
612
+
613
+ config_class = PicoDecoderHFConfig
614
+ _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
615
+ main_input_name = "input_ids"
616
+
617
+ def __init__(self, config: PicoDecoderHFConfig):
618
+ super().__init__(config)
619
+ self.pico_decoder = PicoDecoder(config)
620
+ # Initialize generation config with defaults
621
+ self.generation_config = GenerationConfig()
622
+ # Set some reasonable defaults for the model
623
+ if hasattr(config, "max_position_embeddings"):
624
+ self.generation_config.max_length = config.max_position_embeddings
625
+ if hasattr(config, "vocab_size"):
626
+ self.generation_config.vocab_size = config.vocab_size
627
+
628
+ def forward(
629
+ self,
630
+ input_ids: torch.Tensor,
631
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
632
+ use_cache: bool = False,
633
+ **kwargs,
634
+ ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
635
+ """HuggingFace forward pass wrapper.
636
+
637
+ Forward pass for the HuggingFace version of the Pico model: a thin wrapper around the
638
+ Pico model's forward pass that returns the output as a HuggingFace CausalLMOutput.
639
+ """
640
+ logits, past_key_values = self.pico_decoder(
641
+ input_ids, past_key_values, use_cache
642
+ )
643
+ if use_cache:
644
+ return CausalLMOutputWithPast(
645
+ logits=logits,
646
+ past_key_values=past_key_values,
647
+ )
648
+ else:
649
+ return CausalLMOutput(
650
+ logits=logits,
651
+ )
652
+
653
+ def prepare_inputs_for_generation(
654
+ self,
655
+ input_ids: torch.LongTensor,
656
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
657
+ attention_mask: Optional[torch.LongTensor] = None,
658
+ **kwargs,
659
+ ) -> Dict[str, Any]:
660
+ """
661
+ Prepare inputs for generation.
662
+
663
+ Args:
664
+ input_ids: Input token IDs
665
+ past_key_values: Cached key-value pairs from previous forward passes
666
+ attention_mask: Attention mask for the input
667
+ **kwargs: Additional arguments
668
+
669
+ Returns:
670
+ Dictionary containing prepared inputs
671
+ """
672
+ # If we have past_key_values, we only need the last token
673
+ if past_key_values is not None:
674
+ input_ids = input_ids[:, -1:]
675
+
676
+ return {
677
+ "input_ids": input_ids,
678
+ "past_key_values": past_key_values,
679
+ "use_cache": True,
680
+ }
681
+
682
+ def get_input_embeddings(self):
683
+ """Get the input embeddings layer."""
684
+ return self.pico_decoder.embedding_proj
685
+
686
+ def set_input_embeddings(self, value):
687
+ """Set the input embeddings layer."""
688
+ self.pico_decoder.embedding_proj = value
689
+
690
+ def get_output_embeddings(self):
691
+ """Get the output embeddings layer."""
692
+ return self.pico_decoder.de_embedding_proj
693
+
694
+ def set_output_embeddings(self, value):
695
+ """Set the output embeddings layer."""
696
+ self.pico_decoder.de_embedding_proj = value
697
+
698
+ def get_lm_head(self):
699
+ """Get the language model head."""
700
+ return self.pico_decoder.de_embedding_proj
701
+
702
+ def can_generate(self) -> bool:
703
+ """Check if the model can generate text."""
704
+ return True
705
+
706
+ @property
707
+ def is_encoder_decoder(self) -> bool:
708
+ """Check if the model is an encoder-decoder model."""
709
+ return False
710
+
711
+ @property
712
+ def can_use_cache(self) -> bool:
713
+ """Check if the model can use KV cache."""
714
+ return True
715
+
716
+ def resize_token_embeddings(
717
+ self, new_num_tokens: Optional[int] = None
718
+ ) -> torch.nn.Embedding:
719
+ """Resize token embeddings."""
720
+ old_embeddings = self.get_input_embeddings()
721
+ if new_num_tokens is None:
722
+ new_num_tokens = old_embeddings.num_embeddings
723
+
724
+ new_embeddings = torch.nn.Embedding(
725
+ new_num_tokens, old_embeddings.embedding_dim
726
+ )
727
+ # NOTE: copy only the overlapping rows so that shrinking the vocabulary does not
+ # raise a shape-mismatch error
+ num_tokens_to_copy = min(new_num_tokens, old_embeddings.num_embeddings)
+ new_embeddings.weight.data[:num_tokens_to_copy] = old_embeddings.weight.data[
+ :num_tokens_to_copy
+ ]
730
+
731
+ self.pico_decoder.embedding_proj = new_embeddings
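+ # NOTE: the output projection below is re-created from scratch, so any learned
+ # de-embedding weights are not carried over by this resize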
732
+ self.pico_decoder.de_embedding_proj = torch.nn.Linear(
733
+ old_embeddings.embedding_dim, new_num_tokens, bias=False
734
+ )
735
+
736
+ return new_embeddings
737
+
738
+
739
+ # Register for auto classes
740
+ PicoDecoderHFConfig.register_for_auto_class()
741
+ PicoDecoderHF.register_for_auto_class("AutoModel")
742
+ PicoDecoderHF.register_for_auto_class("AutoModelForCausalLM")
743
+
744
+
745
+ ########################################################
746
+ #
747
+ # New PicoDecoderForCausalLM class for generation support
748
+ #
749
+ ########################################################
750
+
751
+
752
+ class PicoDecoderForCausalLM(PreTrainedModel, GenerationMixin):
753
+ """
754
+ PicoDecoderForCausalLM: A HuggingFace-compatible model that properly supports generation.
755
+
756
+ This class is designed to work with existing checkpoints and provides full generation support.
757
+ It inherits from the right base classes that HuggingFace expects for text generation.
758
+ """
759
+
760
+ config_class = PicoDecoderHFConfig
761
+ _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
762
+ main_input_name = "input_ids"
763
+
764
+ def __init__(self, config: PicoDecoderHFConfig):
765
+ super().__init__(config)
766
+ self.pico_decoder = PicoDecoder(config)
767
+ # Initialize generation config with defaults
768
+ self.generation_config = GenerationConfig()
769
+ # Set some reasonable defaults for the model
770
+ if hasattr(config, "max_position_embeddings"):
771
+ self.generation_config.max_length = config.max_position_embeddings
772
+ if hasattr(config, "vocab_size"):
773
+ self.generation_config.vocab_size = config.vocab_size
774
+
775
+ def forward(
776
+ self,
777
+ input_ids: torch.Tensor,
778
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
779
+ use_cache: bool = False,
780
+ **kwargs,
781
+ ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
782
+ """Forward pass for text generation."""
783
+ logits, past_key_values = self.pico_decoder(
784
+ input_ids, past_key_values, use_cache
785
+ )
786
+ if use_cache:
787
+ return CausalLMOutputWithPast(
788
+ logits=logits,
789
+ past_key_values=past_key_values,
790
+ )
791
+ else:
792
+ return CausalLMOutput(
793
+ logits=logits,
794
+ )
795
+
796
+ def prepare_inputs_for_generation(
797
+ self,
798
+ input_ids: torch.LongTensor,
799
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
800
+ attention_mask: Optional[torch.LongTensor] = None,
801
+ **kwargs,
802
+ ) -> Dict[str, Any]:
803
+ """Prepare inputs for generation."""
804
+ # If we have past_key_values, we only need the last token
805
+ if past_key_values is not None:
806
+ input_ids = input_ids[:, -1:]
807
+
808
+ return {
809
+ "input_ids": input_ids,
810
+ "past_key_values": past_key_values,
811
+ "use_cache": True,
812
+ }
813
+
814
+ def get_input_embeddings(self):
815
+ """Get the input embeddings layer."""
816
+ return self.pico_decoder.embedding_proj
817
+
818
+ def set_input_embeddings(self, value):
819
+ """Set the input embeddings layer."""
820
+ self.pico_decoder.embedding_proj = value
821
+
822
+ def get_output_embeddings(self):
823
+ """Get the output embeddings layer."""
824
+ return self.pico_decoder.de_embedding_proj
825
+
826
+ def set_output_embeddings(self, value):
827
+ """Set the output embeddings layer."""
828
+ self.pico_decoder.de_embedding_proj = value
829
+
830
+ def get_lm_head(self):
831
+ """Get the language model head."""
832
+ return self.pico_decoder.de_embedding_proj
833
+
834
+ def can_generate(self) -> bool:
835
+ """Check if the model can generate text."""
836
+ return True
837
+
838
+ @property
839
+ def is_encoder_decoder(self) -> bool:
840
+ """Check if the model is an encoder-decoder model."""
841
+ return False
842
+
843
+ @property
844
+ def can_use_cache(self) -> bool:
845
+ """Check if the model can use KV cache."""
846
+ return True
847
+
848
+ def resize_token_embeddings(
849
+ self, new_num_tokens: Optional[int] = None
850
+ ) -> torch.nn.Embedding:
851
+ """Resize token embeddings."""
852
+ old_embeddings = self.get_input_embeddings()
853
+ if new_num_tokens is None:
854
+ new_num_tokens = old_embeddings.num_embeddings
855
+
856
+ new_embeddings = torch.nn.Embedding(
857
+ new_num_tokens, old_embeddings.embedding_dim
858
+ )
859
+ # NOTE: copy only the overlapping rows so that shrinking the vocabulary does not
+ # raise a shape-mismatch error
+ num_tokens_to_copy = min(new_num_tokens, old_embeddings.num_embeddings)
+ new_embeddings.weight.data[:num_tokens_to_copy] = old_embeddings.weight.data[
+ :num_tokens_to_copy
+ ]
862
+
863
+ self.pico_decoder.embedding_proj = new_embeddings
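+ # NOTE: the output projection below is re-created from scratch, so any learned
+ # de-embedding weights are not carried over by this resize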
864
+ self.pico_decoder.de_embedding_proj = torch.nn.Linear(
865
+ old_embeddings.embedding_dim, new_num_tokens, bias=False
866
+ )
867
+
868
+ return new_embeddings
869
+
870
+ @classmethod
871
+ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
872
+ """
873
+ Load a pretrained model from a checkpoint.
874
+
875
+ This method handles loading from both the old PicoDecoderHF format and the new format.
876
+ """
877
+ # First try to load with the new class
878
+ try:
879
+ return super().from_pretrained(
880
+ pretrained_model_name_or_path, *model_args, **kwargs
881
+ )
882
+ except Exception as e:
883
+ print(f"Failed to load with new class: {e}")
884
+ print("Attempting to load with legacy class and convert...")
885
+
886
+ # Try to load with the old class and convert
887
+ try:
888
+ from transformers import AutoModel
889
+
890
+ old_model = AutoModel.from_pretrained(
891
+ pretrained_model_name_or_path,
892
+ trust_remote_code=True,
893
+ *model_args,
894
+ **kwargs,
895
+ )
896
+
897
+ # Create new model instance
898
+ new_model = cls(old_model.config)
899
+
900
+ # Copy state dict
901
+ new_model.load_state_dict(old_model.state_dict(), strict=False)
902
+
903
+ return new_model
904
+
905
+ except Exception as e2:
906
+ print(f"Failed to convert from legacy format: {e2}")
907
+ raise e
908
+
909
+
910
+ # Register the new class
911
+ PicoDecoderForCausalLM.register_for_auto_class("AutoModelForCausalLM")
src/training/trainer.py ADDED
@@ -0,0 +1,753 @@
1
+ """
2
+ Pico Language Model Trainer
3
+
4
+ This Trainer implements a minimalistic end-to-end training pipeline of the Pico language model with
5
+ distributed training support via Lightning Fabric. It provides a modular and configurable training
6
+ pipeline with the features:
7
+
8
+ - Configuration Management: YAML-based configuration for all aspects of training
9
+ - Distributed Training: Multi-GPU support via Lightning Fabric
10
+ - Checkpointing: Regular model saving and training state recovery
11
+ - Evaluation: Periodic model evaluation on validation datasets
12
+ - Logging: Comprehensive metric tracking and experiment monitoring
13
+ - Optimization: Support for gradient accumulation, clipping, and LR scheduling
14
+ """
15
+
16
+ import logging
17
+ import os
18
+ import platform
19
+ from typing import Any, Dict
20
+
21
+ import lightning as L
22
+ import psutil
23
+ import torch
24
+ import torch.nn.functional as F
25
+ import yaml
26
+ from datasets import Dataset, load_dataset
27
+ from lightning.fabric.utilities.rank_zero import rank_zero_only
28
+
29
+ from src.checkpointing import (
30
+ compute_learning_dynamics_states,
31
+ load_checkpoint,
32
+ save_checkpoint,
33
+ save_evaluation_results,
34
+ save_learning_dynamics_states,
35
+ )
36
+ from src.evaluation import run_evaluation
37
+ from src.training.utils import (
38
+ initialize_configuration,
39
+ initialize_dataloader,
40
+ initialize_dataset,
41
+ initialize_fabric,
42
+ initialize_hf_checkpointing,
43
+ initialize_logging,
44
+ initialize_lr_scheduler,
45
+ initialize_model,
46
+ initialize_optimizer,
47
+ initialize_run_dir,
48
+ initialize_tokenizer,
49
+ initialize_wandb,
50
+ )
51
+ from src.training.utils.logging import pretty_print_yaml_config
52
+
53
+
54
+ class Trainer:
55
+ def __init__(self, config_path: str):
56
+ """
57
+ Initializes the Trainer class. This Trainer class implements a `train` method, which is the
58
+ main entry point for training the Pico model. Before calling `train`, the Trainer class
59
+ initializes the following:
60
+
61
+ - Configuration loading and validation
62
+ - Model, optimizer, and dataset setup
63
+ - Logging and experiment tracking setup
64
+ - Checkpoint management
65
+
66
+ Args:
67
+ config_path (str): Path to the YAML configuration file containing any overrides.
68
+ """
69
+
70
+ ########################################################
71
+ #
72
+ # Basic Initialization of Configs, Fabric, Model, Optimizer, etc.
73
+ #
74
+ ########################################################
75
+
76
+ # Setup Config
77
+ self.configs = initialize_configuration(config_path)
78
+
79
+ # Setup Run Directory (i.e. where we store checkpoints, logs, etc.)
80
+ initialize_run_dir(checkpointing_config=self.configs["checkpointing"])
81
+
82
+ # Setup Logger
83
+ if self.configs["monitoring"].save_to_wandb:
84
+ wandb_logger = initialize_wandb(
85
+ monitoring_config=self.configs["monitoring"],
86
+ checkpointing_config=self.configs["checkpointing"],
87
+ )
88
+ else:
89
+ wandb_logger = None
90
+
91
+ # Setup Fabric
92
+ self.fabric = initialize_fabric(
93
+ training_config=self.configs["training"],
94
+ wandb_logger=wandb_logger,
95
+ )
96
+ L.seed_everything(42, verbose=False)
97
+
98
+ # Optimize for Tensor Cores on RTX 5090
99
+ if self.fabric.device.type == "cuda":
100
+ torch.set_float32_matmul_precision(
101
+ "high"
102
+ ) # Enables TF32 matmuls on Tensor Cores for better throughput
103
+ print(
104
+ "Enabled Tensor Core optimization: torch.set_float32_matmul_precision('high')"
105
+ )
106
+
107
+ # Set up logging
108
+ self.logger = initialize_logging(
109
+ monitoring_config=self.configs["monitoring"],
110
+ checkpointing_config=self.configs["checkpointing"],
111
+ fabric=self.fabric,
112
+ )
113
+
114
+ # Setup Model, Optimizer, and Dataloaders
115
+ self.model = initialize_model(model_config=self.configs["model"])
116
+ self.optimizer = initialize_optimizer(
117
+ training_config=self.configs["training"], model=self.model
118
+ )
119
+ self.lr_scheduler = initialize_lr_scheduler(
120
+ training_config=self.configs["training"], optimizer=self.optimizer
121
+ )
122
+
123
+ # Wrap model and optimizer with Fabric
124
+ self.model, self.optimizer = self.fabric.setup(self.model, self.optimizer)
125
+
126
+ # Setup HuggingFace Checkpointing
127
+ if self.configs["checkpointing"].save_to_hf:
128
+ initialize_hf_checkpointing(
129
+ checkpointing_config=self.configs["checkpointing"], fabric=self.fabric
130
+ )
131
+
132
+ ########################################################
133
+ #
134
+ # Boilerplate to deal with loading/resuming from checkpoints
135
+ #
136
+ ########################################################
137
+
138
+ self.should_load_checkpoint = self.configs["checkpointing"].training.auto_resume
139
+
140
+ # Possibly load a checkpoint
141
+ if self.should_load_checkpoint:
142
+ resume_checkpoint = load_checkpoint(
143
+ checkpointing_config=self.configs["checkpointing"],
144
+ checkpoint_step="latest",
145
+ fabric=self.fabric,
146
+ model=self.model,
147
+ optimizer=self.optimizer,
148
+ lr_scheduler=self.lr_scheduler,
149
+ )
150
+
151
+ if resume_checkpoint:
152
+ (
153
+ self.model,
154
+ self.optimizer,
155
+ self.lr_scheduler,
156
+ self.initial_batch_step,
157
+ ) = resume_checkpoint
158
+ else:
159
+ self.initial_batch_step = 0
160
+ else:
161
+ self.initial_batch_step = 0
162
+
163
+ ########################################################
164
+ #
165
+ # Initialization of Dataset & DataLoader (possibly fast-forwarding to correct batch)
166
+ #
167
+ ########################################################
168
+
169
+ self.train_dataset, fast_forward_steps = initialize_dataset(
170
+ data_config=self.configs["data"],
171
+ fabric=self.fabric,
172
+ initial_batch_step=self.initial_batch_step,
173
+ return_fast_forward_steps=True,
174
+ )
175
+
176
+ self.train_dataloader = initialize_dataloader(
177
+ data_config=self.configs["data"],
178
+ training_config=self.configs["training"],
179
+ fabric=self.fabric,
180
+ dataset=self.train_dataset,
181
+ )
182
+ self.train_dataloader = self.fabric.setup_dataloaders(
183
+ self.train_dataloader, use_distributed_sampler=False
184
+ )
185
+
186
+ self.tokenizer = initialize_tokenizer(data_config=self.configs["data"])
187
+
188
+ # NOTE: We may need to fast-forward the iterator to the correct step so that we can
189
+ # continue from the correct batch of data we would have seen had training not
190
+ # previously stopped.
191
+ train_iterator = iter(self.train_dataloader)
192
+ if fast_forward_steps > 0:
193
+ fast_forward_sub_steps = (
194
+ fast_forward_steps
195
+ * self.configs["training"].optimization.gradient_accumulation_steps
196
+ )
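+ # e.g. fast-forwarding 5 batch steps with 32 gradient accumulation steps skips
+ # 5 * 32 = 160 sub-batches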
197
+ for _ in range(fast_forward_sub_steps):
198
+ next(train_iterator)
199
+
200
+ self.train_iterator = train_iterator
201
+
202
+ # NOTE: Sychronizing processes after fast-forwarding iterator
203
+ self.fabric.barrier()
204
+
205
+ ########################################################
206
+ #
207
+ # Helper flags used during training for checkpointing and evaluation
208
+ #
209
+ ########################################################
210
+
211
+ # Helper flag to determine if we should evaluate the model
212
+ self.should_evaluate = (
213
+ self.configs["evaluation"].metrics is not None
214
+ and len(self.configs["evaluation"].metrics) > 0
215
+ )
216
+
217
+ self.should_compute_learning_dynamics = (
218
+ self.configs["checkpointing"].learning_dynamics.layer_suffixes is not None
219
+ and len(self.configs["checkpointing"].learning_dynamics.layer_suffixes) > 0
220
+ )
221
+
222
+ if self.should_compute_learning_dynamics:
223
+ if self.configs["checkpointing"].learning_dynamics.eval_data is not None:
224
+ self.learning_dynamics_eval_dataset = load_dataset(
225
+ self.configs["checkpointing"].learning_dynamics.eval_data,
226
+ split="val",
227
+ )
228
+ else:
229
+ self.learning_dynamics_eval_dataset = None
230
+
231
+ def train(self) -> None:
232
+ """Execute the main training pipeline.
233
+
234
+ This method orchestrates the complete training process by:
235
+ 1. Creating an initial checkpoint to save the starting state and evaluate the model as a
236
+ baseline
237
+ 2. Running the main training loop via `_training_loop`
238
+ 3. Handling final checkpointing and evaluation
239
+
240
+ The training progress is tracked through checkpoints and evaluations
241
+ at intervals specified in the configuration.
242
+ """
243
+
244
+ ########################################################
245
+ #
246
+ # Initial Checkpointing and Evaluation
247
+ #
248
+ ########################################################
249
+
250
+ # Save Initial Checkpoint -- If the checkpoint already exists, this performs a no-op
251
+ save_checkpoint(
252
+ configs=self.configs,
253
+ checkpoint_step=self.initial_batch_step,
254
+ fabric=self.fabric,
255
+ model=self.model,
256
+ optimizer=self.optimizer,
257
+ lr_scheduler=self.lr_scheduler,
258
+ tokenizer=self.tokenizer,
259
+ )
260
+
261
+ # Save Initial Evaluation Results
262
+ if self.should_evaluate:
263
+ if self.initial_batch_step == 0:
264
+ evaluation_results = run_evaluation(
265
+ evaluation_config=self.configs["evaluation"],
266
+ checkpointing_config=self.configs["checkpointing"],
267
+ fabric=self.fabric,
268
+ model=self.model,
269
+ )
270
+ self._log_evaluation_results(
271
+ evaluation_results, self.initial_batch_step
272
+ )
273
+ save_evaluation_results(
274
+ checkpointing_config=self.configs["checkpointing"],
275
+ fabric=self.fabric,
276
+ evaluation_results=evaluation_results,
277
+ checkpoint_step=self.initial_batch_step,
278
+ )
279
+ else:
280
+ # NOTE: If the run crashed while evaluating, we need to restart the evaluation
281
+ eval_results_path = os.path.join(
282
+ self.configs["checkpointing"].evaluation.eval_results_dir,
283
+ f"step_{self.initial_batch_step}.json",
284
+ )
285
+ if not os.path.exists(eval_results_path):
286
+ evaluation_results = run_evaluation(
287
+ evaluation_config=self.configs["evaluation"],
288
+ checkpointing_config=self.configs["checkpointing"],
289
+ fabric=self.fabric,
290
+ model=self.model,
291
+ )
292
+ self._log_evaluation_results(
293
+ evaluation_results, self.initial_batch_step
294
+ )
295
+ save_evaluation_results(
296
+ checkpointing_config=self.configs["checkpointing"],
297
+ fabric=self.fabric,
298
+ evaluation_results=evaluation_results,
299
+ checkpoint_step=self.initial_batch_step,
300
+ )
301
+
302
+ ########################################################
303
+ #
304
+ # Main Training Loop (see `_training_loop` for details)
305
+ #
306
+ ########################################################
307
+
308
+ if self.initial_batch_step < self.configs["training"].max_steps:
309
+ self._log_training_configuration()
310
+ final_step = self._training_loop()
311
+ else:
312
+ final_step = self.initial_batch_step
313
+
314
+ ########################################################
315
+ #
316
+ # Final Checkpointing and Evaluation
317
+ #
318
+ ########################################################
319
+
320
+ # Save Learning Dynamics States
321
+ if self.should_compute_learning_dynamics:
322
+ if self.learning_dynamics_eval_dataset is not None:
323
+ self.log(f"Step {final_step} -- 📈 Saving Learning Dynamics")
324
+ learning_dynamics_val_states = compute_learning_dynamics_states(
325
+ checkpointing_config=self.configs["checkpointing"],
326
+ fabric=self.fabric,
327
+ model=self.model,
328
+ dataset=self.learning_dynamics_eval_dataset,
329
+ compute_gradients=True,
330
+ )
331
+ save_learning_dynamics_states(
332
+ checkpointing_config=self.configs["checkpointing"],
333
+ fabric=self.fabric,
334
+ learning_dynamics_states=learning_dynamics_val_states,
335
+ checkpoint_step=final_step,
336
+ prefix="val",
337
+ )
338
+
339
+ # Handle checkpointing and final evaluation
340
+ if final_step % self.configs["checkpointing"].save_every_n_steps != 0:
341
+ self.log(f"Step {final_step} -- 💾 Saving Final Checkpoint")
342
+ save_checkpoint(
343
+ configs=self.configs,
344
+ checkpoint_step=final_step,
345
+ fabric=self.fabric,
346
+ model=self.model,
347
+ optimizer=self.optimizer,
348
+ lr_scheduler=self.lr_scheduler,
349
+ tokenizer=self.tokenizer,
350
+ )
351
+
352
+ # Final evaluation
353
+ if self.should_evaluate:
354
+ evaluation_results = run_evaluation(
355
+ evaluation_config=self.configs["evaluation"],
356
+ checkpointing_config=self.configs["checkpointing"],
357
+ fabric=self.fabric,
358
+ model=self.model,
359
+ )
360
+ self._log_evaluation_results(evaluation_results, final_step)
361
+ save_evaluation_results(
362
+ checkpointing_config=self.configs["checkpointing"],
363
+ checkpoint_step=final_step,
364
+ fabric=self.fabric,
365
+ evaluation_results=evaluation_results,
366
+ )
367
+
368
+ self.log(f"🎉 Training complete! Final step: {final_step}")
369
+
370
+ if final_step < self.configs["training"].max_steps:
371
+ self.log(
372
+ f"\t Note: Training stopped before max steps ({self.configs['training'].max_steps})",
373
+ level=logging.WARNING,
374
+ )
375
+
376
+ # Cleanup distributed training
377
+ self.fabric.barrier()
378
+ if torch.cuda.is_available():
379
+ torch.cuda.empty_cache()
380
+ if torch.distributed.is_initialized():
381
+ torch.distributed.destroy_process_group()
382
+
383
+ del self.train_dataloader # NOTE: shuts down any dataloader worker processes
384
+
385
+ self.fabric.barrier()
386
+
387
+ def _training_loop(self) -> int:
388
+ """Execute the main training loop.
389
+
390
+ This method orchestrates the core training loop and includes the following features:
391
+ - Gradient accumulation
392
+ - Gradient clipping
393
+ - Periodic model evaluation and checkpointing
394
+ - Learning Dynamics Checkpointing
395
+ - Learning rate scheduling
396
+ - Logging of training metrics including loss and learning rate
397
+ - Handling of infinite/NaN losses
398
+
399
+ Returns:
400
+ int: The final step count reached during training.
401
+ NOTE: A complete training run should match the configured max_steps.
402
+ """
403
+ # Setup training loop variables
404
+ batch_step = self.initial_batch_step
405
+
406
+ # NOTE: these are used to compute the average loss over a training interval.
407
+ # This is more accurate than using the loss at the end of the interval.
408
+ interval_loss = torch.tensor(0.0, device=self.fabric.device)
409
+ interval_steps = torch.tensor(0, device=self.fabric.device)
410
+ interval_inf_or_nan_count = torch.tensor(0, device=self.fabric.device)
411
+
412
+ if self.should_compute_learning_dynamics:
413
+ # NOTE: we basically re-construct the full batch here so that we can compute learning dynamics
414
+ training_batch = {"input_ids": []}
415
+
416
+ # NOTE: determine what sub-batch we should start from
417
+ initial_sub_batch_step = (
418
+ batch_step
419
+ * self.configs["training"].optimization.gradient_accumulation_steps
420
+ )
421
+
422
+ ###############################################################
423
+ #
424
+ # Core loop starts here
425
+ # NOTE: the ratio between sub_batch_step and batch_step
426
+ # is the configured number of gradient_accumulation_steps
427
+ # i.e. with 32 configured gradient accumulation steps,
428
+ # there are 32 sub_batch_steps for each batch_step
429
+ #
430
+ ###############################################################
431
+
432
+ for sub_batch_step, sub_batch in enumerate(
433
+ self.train_iterator, start=initial_sub_batch_step
434
+ ):
435
+ # NOTE: We want to store the entire training batch whenever we are computing learning dynamics
436
+ # and we are at a checkpointing step.
437
+ should_store_training_batch = self.should_compute_learning_dynamics and (
438
+ batch_step % self.configs["checkpointing"].save_every_n_steps == 0
439
+ )
440
+
441
+ ########################################################
442
+ #
443
+ # Forward Pass
444
+ #
445
+ ########################################################
446
+
447
+ _input_ids = torch.tensor(sub_batch["input_ids"], device=self.fabric.device)
448
+ input_ids = _input_ids[:, :-1]
449
+ labels = _input_ids[:, 1:]
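+ # NOTE: standard next-token prediction -- for a sequence [a, b, c, d], the model is
+ # fed [a, b, c] and trained to predict the shifted targets [b, c, d]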
450
+
451
+ if should_store_training_batch:
452
+ gathered_input_ids = self.fabric.all_gather(_input_ids)
453
+
454
+ # NOTE: On multi-GPU, we need to reshape the input_ids to be a 2D tensor; on
455
+ # a single GPU, the input_ids are already a 2D tensor.
456
+ if self.fabric.world_size > 1:
457
+ gathered_input_ids = gathered_input_ids.reshape(
458
+ -1, *gathered_input_ids.shape[2:]
459
+ )
460
+
461
+ training_batch["input_ids"].extend(gathered_input_ids.tolist())
462
+
463
+ # Forward pass
464
+ model_output, _ = self.model(input_ids)
465
+ model_output = model_output.transpose(1, 2)
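+ # NOTE: F.cross_entropy expects logits of shape (batch, vocab_size, seq_len), so the
+ # vocabulary dimension is moved to axis 1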
466
+
467
+ ########################################################
468
+ #
469
+ # Gradient accumulation
470
+ #
471
+ ########################################################
472
+
473
+ should_accumulate_gradients = (sub_batch_step + 1) % self.configs[
474
+ "training"
475
+ ].optimization.gradient_accumulation_steps != 0
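+ # e.g. with 4 gradient accumulation steps, sub-steps 0-2 only accumulate gradients,
+ # and every 4th sub-step falls through to the optimizer step below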
476
+
477
+ with self.fabric.no_backward_sync(
478
+ self.model, enabled=should_accumulate_gradients
479
+ ):
480
+ loss = F.cross_entropy(model_output, labels)
481
+ self.fabric.backward(
482
+ loss
483
+ / self.configs["training"].optimization.gradient_accumulation_steps,
484
+ model=self.model,
485
+ )
486
+
487
+ if torch.isnan(loss) or torch.isinf(loss):
488
+ interval_inf_or_nan_count += 1
489
+ else:
490
+ interval_loss += loss.item()
491
+ interval_steps += 1
492
+
493
+ # NOTE: if we are still accumulating gradients, we skip the logging and optimization steps
494
+ if should_accumulate_gradients:
495
+ continue
496
+
497
+ ########################################################
498
+ #
499
+ # Logging
500
+ #
501
+ ########################################################
502
+
503
+ if batch_step % self.configs["monitoring"].logging.log_every_n_steps == 0:
504
+ self._log_training_metrics(
505
+ interval_loss=interval_loss,
506
+ interval_steps=interval_steps,
507
+ interval_inf_or_nan_count=interval_inf_or_nan_count,
508
+ batch_step=batch_step,
509
+ )
510
+ interval_loss = torch.tensor(0.0, device=self.fabric.device)
511
+ interval_steps = torch.tensor(0, device=self.fabric.device)
512
+ interval_inf_or_nan_count = torch.tensor(0, device=self.fabric.device)
513
+
514
+ ########################################################
515
+ #
516
+ # Learning Dynamics Checkpointing
517
+ #
518
+ ########################################################
519
+
520
+ if batch_step % self.configs["checkpointing"].save_every_n_steps == 0:
521
+ if self.should_compute_learning_dynamics:
522
+ self.log(f"Step {batch_step} -- 📈 Saving Learning Dynamics")
523
+
524
+ # Training Batch Learning Dynamics
525
+ training_batch_dataset = Dataset.from_dict(training_batch)
526
+
527
+ learning_dynamics_train_states = compute_learning_dynamics_states(
528
+ checkpointing_config=self.configs["checkpointing"],
529
+ fabric=self.fabric,
530
+ model=self.model,
531
+ dataset=training_batch_dataset,
532
+ compute_gradients=True,
533
+ )
534
+
535
+ save_learning_dynamics_states(
536
+ checkpointing_config=self.configs["checkpointing"],
537
+ checkpoint_step=batch_step,
538
+ prefix="train",
539
+ fabric=self.fabric,
540
+ learning_dynamics_states=learning_dynamics_train_states,
541
+ learning_dynamics_dataset=training_batch_dataset,
542
+ tokenizer=self.tokenizer,
543
+ )
544
+ training_batch = {
545
+ "input_ids": []
546
+ } # Resetting training_batch for next training batch
547
+
548
+ # Validation Data Learning Dynamics
549
+ if self.learning_dynamics_eval_dataset is not None:
550
+ learning_dynamics_val_states = compute_learning_dynamics_states(
551
+ checkpointing_config=self.configs["checkpointing"],
552
+ fabric=self.fabric,
553
+ model=self.model,
554
+ dataset=self.learning_dynamics_eval_dataset,
555
+ compute_gradients=True,
556
+ )
557
+ save_learning_dynamics_states(
558
+ checkpointing_config=self.configs["checkpointing"],
559
+ checkpoint_step=batch_step,
560
+ prefix="val",
561
+ fabric=self.fabric,
562
+ learning_dynamics_states=learning_dynamics_val_states,
563
+ )
564
+
565
+ ########################################################
566
+ #
567
+ # Optimization step
568
+ #
569
+ ########################################################
570
+
571
+ self.optimizer.step()
572
+ self.optimizer.zero_grad()
573
+ self.lr_scheduler.step()
574
+
575
+ batch_step += 1
576
+
577
+ ########################################################
578
+ #
579
+ # Training Checkpointing and evaluation
580
+ #
581
+ ########################################################
582
+
583
+ if batch_step % self.configs["checkpointing"].save_every_n_steps == 0:
584
+ self.log(f"Step {batch_step} -- 💾 Saving Checkpoint")
585
+ save_checkpoint(
586
+ configs=self.configs,
587
+ checkpoint_step=batch_step,
588
+ fabric=self.fabric,
589
+ model=self.model,
590
+ optimizer=self.optimizer,
591
+ lr_scheduler=self.lr_scheduler,
592
+ tokenizer=self.tokenizer,
593
+ )
594
+
595
+ if self.should_evaluate:
596
+ evaluation_results = run_evaluation(
597
+ evaluation_config=self.configs["evaluation"],
598
+ checkpointing_config=self.configs["checkpointing"],
599
+ fabric=self.fabric,
600
+ model=self.model,
601
+ )
602
+ if evaluation_results is not None:
603
+ self._log_evaluation_results(evaluation_results, batch_step)
604
+ save_evaluation_results(
605
+ checkpointing_config=self.configs["checkpointing"],
606
+ fabric=self.fabric,
607
+ evaluation_results=evaluation_results,
608
+ checkpoint_step=batch_step,
609
+ )
610
+
611
+ # Break if we've reached training steps
612
+ if batch_step >= self.configs["training"].max_steps:
613
+ break
614
+
615
+ return batch_step
616
+
617
+ ########################################################
618
+ #
619
+ # Trainer Logging Functionalities
620
+ #
621
+ ########################################################
622
+
623
+ def _log_training_metrics(
624
+ self,
625
+ interval_loss: torch.Tensor,
626
+ interval_steps: torch.Tensor,
627
+ interval_inf_or_nan_count: torch.Tensor,
628
+ batch_step: int,
629
+ ):
630
+ """
631
+ Gathers together the training metrics computed across all processes in distributed training
632
+ and logs them in a tree-style format.
633
+ """
634
+ gathered_interval_loss = self.fabric.all_reduce(
635
+ interval_loss, reduce_op="sum"
636
+ ).item()
637
+ gathered_interval_inf_or_nan_count = self.fabric.all_reduce(
638
+ interval_inf_or_nan_count, reduce_op="sum"
639
+ ).item()
640
+ gathered_interval_steps = self.fabric.all_reduce(
641
+ interval_steps, reduce_op="sum"
642
+ ).item()
643
+
644
+ avg_loss = (
645
+ gathered_interval_loss / gathered_interval_steps
646
+ if gathered_interval_steps > 0
647
+ else float("inf")
648
+ )
649
+
650
+ self.fabric.log("train/loss", avg_loss, step=batch_step)
651
+ self.fabric.log(
652
+ "trainer/inf_or_nan_count",
653
+ gathered_interval_inf_or_nan_count,
654
+ step=batch_step,
655
+ )
656
+ self.fabric.log(
657
+ "trainer/learning_rate",
658
+ self.lr_scheduler.get_last_lr()[0],
659
+ step=batch_step,
660
+ )
661
+
662
+ # Log to console in tree format
663
+ self.log(f"Step {batch_step} -- 🔄 Training Metrics")
664
+ self.log(f"├── Loss: {avg_loss:.4f}")
665
+ self.log(f"├── Learning Rate: {self.lr_scheduler.get_last_lr()[0]:.2e}")
666
+ self.log(f"└── Inf/NaN count: {gathered_interval_inf_or_nan_count}")
667
+
668
+ def _log_evaluation_results(
669
+ self, evaluation_results: Dict[str, Any], batch_step: int
670
+ ):
671
+ """Log model evaluation metrics to experiment tracking system and console."""
672
+ self.log(f"Step {batch_step} -- 📊 Evaluation Results")
673
+ for i, (metric, result) in enumerate(evaluation_results.items()):
674
+ prefix = "└──" if i == len(evaluation_results) - 1 else "├──"
675
+ self.log(f"{prefix} {metric}: {result}")
676
+ self.fabric.log(f"eval/{metric}", result, step=batch_step)
677
+
678
+ def _log_training_configuration(self):
679
+ """
680
+ Log training configuration details as well as runtime information about the hardware,
681
+ software, and batch settings.
682
+
683
+ This function is called at the beginning of the training loop to provide a summary of the
684
+ training configuration.
685
+ """
686
+
687
+ total_params = sum(p.numel() for p in self.model.parameters())
688
+ trainable_params = sum(
689
+ p.numel() for p in self.model.parameters() if p.requires_grad
690
+ )
691
+ global_batch_size = self.configs["data"].dataloader.batch_size
692
+ per_device_batch_size = self.train_dataloader.batch_size
693
+ gradient_accumulation_steps = self.configs[
694
+ "training"
695
+ ].optimization.gradient_accumulation_steps
696
+
697
+ device_type = ""
698
+ fabric_device = str(self.fabric.device)
699
+ if torch.cuda.is_available() and "cuda" in fabric_device:
700
+ device_type = torch.cuda.get_device_name(self.fabric.device)
701
+ elif torch.backends.mps.is_available() and "mps" in fabric_device:
702
+ device_type = "MPS (Apple Silicon)"
703
+ else:
704
+ device_type = "CPU"
705
+
706
+ training_config_path = os.path.join(
707
+ self.configs["checkpointing"].runs_dir,
708
+ self.configs["checkpointing"].run_name,
709
+ "training_config.yaml",
710
+ )
711
+ if os.path.exists(training_config_path):
712
+ self.log("=" * 50)
713
+ self.log("✨ Training Configuration")
714
+ self.log("=" * 50)
715
+ training_config = yaml.safe_load(open(training_config_path, "r"))
716
+ pretty_print_yaml_config(self.logger, training_config)
717
+
718
+ self.log("=" * 50)
719
+ self.log("⛭ Runtime Summary:")
720
+ self.log("=" * 50)
721
+ self.log(f"Starting from step: {self.initial_batch_step}")
722
+
723
+ self.log("Model Setup:")
724
+ self.log(f"└─ Total Parameters: {total_params:,}")
725
+ self.log(f"└─ Trainable Parameters: {trainable_params:,}")
726
+
727
+ self.log("Distributed Setup:")
728
+ self.log(f"└─ Number of Devices: {self.fabric.world_size}")
729
+ self.log(f"└─ Device Type: {device_type}")
730
+ self.log(
731
+ f"└─ Available Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
732
+ if torch.cuda.is_available()
733
+ else f"└─ Available Memory: {psutil.virtual_memory().total / 1e9:.2f} GB"
734
+ )
735
+
736
+ self.log("Software Setup:")
737
+ self.log(f"└─ Python Version: {platform.python_version()}")
738
+ self.log(f"└─ PyTorch Version: {torch.__version__}")
739
+ self.log(
740
+ f"└─ CUDA Version: {torch.version.cuda if torch.cuda.is_available() else 'N/A'}"
741
+ )
742
+ self.log(f"└─ Operating System: {platform.system()} {platform.release()}")
743
+
744
+ self.log("Batch Size Configuration:")
745
+ self.log(f"└─ Global Batch Size: {global_batch_size}")
746
+ self.log(f"└─ Per Device Batch Size: {per_device_batch_size}")
747
+ self.log(f"└─ Gradient Accumulation Steps: {gradient_accumulation_steps}")
748
+ self.log("=" * 50)
749
+
750
+ @rank_zero_only
751
+ def log(self, msg: str, level: int = logging.INFO) -> None:
752
+ """NOTE: Log messages only from rank zero process."""
753
+ self.logger.log(level, msg)
src/training/utils/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ """
2
+ Utility package that contains functions for the training process, e.g. initialization, logging, etc.
3
+ """
4
+
5
+ # For convenience, we export the initialization functions here
6
+ from .initialization import (
7
+ initialize_configuration,
8
+ initialize_dataloader,
9
+ initialize_dataset,
10
+ initialize_fabric,
11
+ initialize_hf_checkpointing,
12
+ initialize_logging,
13
+ initialize_lr_scheduler,
14
+ initialize_model,
15
+ initialize_optimizer,
16
+ initialize_run_dir,
17
+ initialize_tokenizer,
18
+ initialize_wandb,
19
+ )
20
+
21
+ __all__ = [
22
+ "initialize_configuration",
23
+ "initialize_dataloader",
24
+ "initialize_dataset",
25
+ "initialize_fabric",
26
+ "initialize_hf_checkpointing",
27
+ "initialize_logging",
28
+ "initialize_lr_scheduler",
29
+ "initialize_model",
30
+ "initialize_optimizer",
31
+ "initialize_run_dir",
32
+ "initialize_tokenizer",
33
+ "initialize_wandb",
34
+ ]
src/training/utils/data.py ADDED
@@ -0,0 +1,35 @@
1
+ """
2
+ Utilities for data loading and processing.
3
+ """
4
+
5
+ from torch.utils.data import IterableDataset
6
+
7
+
8
+ class ShardedIterableDataset(IterableDataset):
9
+ """
10
+ A super simple implementation of a sharded iterable dataset that enables DataParallelism
11
+ across multiple workers. Ensures that each worker gets a unique shard of the dataset.
12
+
13
+ NOTE: Also works fine if there is only one worker.
14
+ """
15
+
16
+ def __init__(self, dataset, rank, world_size):
17
+ self.dataset = dataset
18
+ self.rank = rank
19
+ self.world_size = world_size
20
+
21
+ def __iter__(self):
22
+ iterator = iter(self.dataset)
23
+ # NOTE: Start by skipping to this worker's shard
24
+ for _ in range(self.rank):
25
+ next(iterator)
26
+
27
+ # NOTE: Yield every world_size-th item
28
+ while True:
29
+ try:
30
+ yield next(iterator)
31
+ # Skip other workers' samples
32
+ for _ in range(self.world_size - 1):
33
+ next(iterator)
34
+ except StopIteration:
35
+ break
src/training/utils/initialization.py ADDED
@@ -0,0 +1,702 @@
1
+ """
2
+ Utilities for initializing components of the training process.
3
+
4
+ Here, we initialize all of the components that are part of the learning process. From logging,
5
+ and checkpointing to the optimizer to the dataset and the dataloader, this file contains the
6
+ logic for setting up the classes and functions that are used in the training loop.
7
+
8
+ As always, this code is meant to be basic. We hard-code the obvious defaults, and leave the
9
+ more experimental stuff to you.
10
+ """
11
+
12
+ import logging
13
+ import math
14
+ import os
15
+ import warnings
16
+ from dataclasses import fields, is_dataclass
17
+ from datetime import datetime
18
+ from typing import Dict, Optional, Union
19
+
20
+ import lightning as L
21
+ import torch
22
+ import yaml
23
+ from datasets import Dataset, DownloadConfig, load_dataset
24
+ from datasets import config as datasets_config
25
+ from huggingface_hub import add_collection_item, create_branch, create_repo
26
+ from lightning.fabric.loggers import Logger as FabricLogger
27
+ from lightning.fabric.utilities.rank_zero import rank_zero_only
28
+ from torch.utils.data import DataLoader
29
+ from transformers import AutoTokenizer
30
+
31
+ import wandb
32
+ from src.config import (
33
+ CheckpointingConfig,
34
+ DataConfig,
35
+ EvaluationConfig,
36
+ ModelConfig,
37
+ MonitoringConfig,
38
+ TrainingConfig,
39
+ )
40
+ from src.model import PicoDecoder
41
+ from src.training.utils.io import use_backoff
42
+ from wandb.integration.lightning.fabric import WandbLogger
43
+
44
+ warnings.filterwarnings(
45
+ "ignore",
46
+ message=".*This integration is tested and supported for lightning Fabric.*",
47
+ )
48
+ warnings.filterwarnings(
49
+ "ignore",
50
+ message=".*Please report any issues to.*",
51
+ )
52
+
53
+ ########################################################
54
+ #
55
+ # Basic Initialization
56
+ #
57
+ ########################################################
58
+
59
+
60
+ def _apply_config_overrides(config, overrides: dict):
61
+ """Recursively apply configuration overrides to a dataclass config object.
62
+
63
+ Args:
64
+ config: Base configuration object (must be a dataclass)
65
+ overrides: Dictionary of override values matching config structure
66
+
67
+ Returns:
68
+ Modified config object with the overrides applied.
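+
+ Example (a sketch; assumes the config defines a nested `optimization.lr` field):
+ >>> _apply_config_overrides(TrainingConfig(), {"optimization": {"lr": 3e-4}})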
69
+ """
70
+ for field in fields(config):
71
+ field_value = getattr(config, field.name)
72
+ if is_dataclass(field_value):
73
+ _apply_config_overrides(field_value, overrides.get(field.name, {}))
74
+ else:
75
+ if field.name in overrides:
76
+ setattr(config, field.name, overrides[field.name])
77
+ return config
78
+
79
+
80
+ def initialize_configuration(
81
+ config_path: Optional[str] = None,
82
+ ) -> Dict[
83
+ str,
84
+ Union[
85
+ DataConfig,
86
+ ModelConfig,
87
+ TrainingConfig,
88
+ EvaluationConfig,
89
+ MonitoringConfig,
90
+ CheckpointingConfig,
91
+ ],
92
+ ]:
93
+ """Initialize configuration objects with optional overrides from a YAML file.
94
+
95
+ This function initializes all of the configuration objects, and then applies
96
+ any overrides from the config_path file. If no config_path is provided,
97
+ the function will use the default configuration objects.
98
+
99
+ Args:
100
+ config_path: Path to a YAML file containing configuration overrides.
101
+
102
+ Returns:
103
+ A dictionary containing the initialized configuration objects.
104
+ """
105
+ data_config = DataConfig()
106
+ model_config = ModelConfig()
107
+ training_config = TrainingConfig()
108
+ evaluation_config = EvaluationConfig()
109
+ monitoring_config = MonitoringConfig()
110
+ checkpointing_config = CheckpointingConfig()
111
+
112
+ if config_path:
113
+ overrides = yaml.safe_load(open(config_path, "r"))
114
+ data_config = _apply_config_overrides(data_config, overrides.get("data", {}))
115
+ model_config = _apply_config_overrides(model_config, overrides.get("model", {}))
116
+ training_config = _apply_config_overrides(
117
+ training_config, overrides.get("training", {})
118
+ )
119
+ evaluation_config = _apply_config_overrides(
120
+ evaluation_config, overrides.get("evaluation", {})
121
+ )
122
+ monitoring_config = _apply_config_overrides(
123
+ monitoring_config, overrides.get("monitoring", {})
124
+ )
125
+ checkpointing_config = _apply_config_overrides(
126
+ checkpointing_config, overrides.get("checkpointing", {})
127
+ )
128
+
129
+ configs = {
130
+ "data": data_config,
131
+ "model": model_config,
132
+ "training": training_config,
133
+ "evaluation": evaluation_config,
134
+ "monitoring": monitoring_config,
135
+ "checkpointing": checkpointing_config,
136
+ }
137
+
138
+ return configs
139
+
140
+
141
+ def initialize_run_dir(checkpointing_config: CheckpointingConfig) -> str:
142
+ """Initialize a directory for the current training run.
143
+
144
+ Creates a unique directory for storing training, evaluation, and logging artifacts.
145
+ If no run name is specified in the config, generates a timestamp-based name.
146
+
147
+ Args:
148
+ checkpointing_config: Configuration object containing run settings.
149
+ NOTE: Must have a 'run_name' attribute that can be None, in which case
150
+ a timestamp-based name will be generated.
151
+
152
+ Returns:
153
+ str: The path to the run directory.
154
+ """
155
+ run_name = checkpointing_config.run_name
156
+ if run_name is None:
157
+ run_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
158
+ checkpointing_config.run_name = run_name
159
+
160
+ run_dir = os.path.join(checkpointing_config.runs_dir, run_name)
161
+
162
+ os.makedirs(run_dir, exist_ok=True)
163
+ return run_dir
164
+
165
+
166
+ def initialize_fabric(
167
+ training_config: TrainingConfig, wandb_logger: Optional[FabricLogger] = None
168
+ ):
169
+ """Initialize Lightning Fabric for distributed training.
170
+
171
+ Sets up a Lightning Fabric instance with the specified configuration for
172
+ handling distributed training, mixed precision, and logging.
173
+
174
+ Args:
175
+ training_config: Configuration object containing fabric settings
176
+ (accelerator, precision, devices, etc.).
177
+ wandb_logger: Optional weights and biases logger instance for experiment tracking
178
+
179
+ Returns:
180
+ L.Fabric: Initialized Lightning Fabric instance.
181
+
182
+ Example:
183
+ >>> fabric = initialize_fabric(training_config, wandb_logger)
184
+ """
185
+
186
+ total_devices = (
187
+ training_config.fabric.num_devices * training_config.fabric.num_nodes
188
+ )
189
+
190
+ if total_devices > 1:
191
+ strategy = "deepspeed_stage_2"
192
+ else:
193
+ strategy = "auto" # Sets up SingleDevice Strategy by default
194
+
195
+ # NOTE: The strategy is set to use either DeepSpeed (Zero Stage 2) on multi-GPU,
196
+ # or SingleDevice Strategy on single-GPU set ups. If you'd like to use a different strategy,
197
+ # you can change the strategy flag in the fabric initialization, but be aware that this might
198
+ # cause issues with checkpointing, evaluation, etc.
199
+
200
+ fabric = L.Fabric(
201
+ accelerator=training_config.fabric.accelerator,
202
+ precision=training_config.fabric.precision,
203
+ devices=training_config.fabric.num_devices,
204
+ num_nodes=training_config.fabric.num_nodes,
205
+ loggers=[wandb_logger] if wandb_logger is not None else None,
206
+ strategy=strategy,
207
+ )
208
+
209
+ fabric.launch()
210
+
211
+ return fabric
212
+
213
+
214
+ ########################################################
215
+ #
216
+ # Dataset and Tokenization Initialization
217
+ #
218
+ ########################################################
219
+
220
+
221
+ @use_backoff(max_retries=20)
222
+ def initialize_dataset(
223
+ data_config: DataConfig,
224
+ fabric: L.Fabric,
225
+ initial_batch_step: Optional[int] = 0,
226
+ return_fast_forward_steps: bool = False,
227
+ ):
228
+ """Initialize dataset based on the given config.
229
+
230
+ This function will return a dataset object, and optionally a fast_forward_steps value.
231
+
232
+ The fast_forward_steps value is the number of steps that we need to fast-forward an iterator by,
233
+ so that we can continue from a certain batch of data we would have seen had training not previously
234
+ stopped. Depending on how the dataset is loaded, the number of steps to fast-forward may be
235
+ different from the initial_batch_step value.
236
+
237
+ NOTE: This functionality is primarily useful for streaming datasets (which for large
238
+ datasets is most of the time).
239
+
240
+ Args:
241
+ data_config: Configuration object containing dataset settings.
242
+ fabric: A Lightning Fabric instance.
243
+ initial_batch_step: The initial batch step to fast-forward to.
244
+ return_fast_forward_steps: Whether to return the fast-forward steps value.
245
+
246
+ Returns:
247
+ Dataset: Initialized dataset object.
248
+ Optional[int]: Number of steps to fast-forward the iterator by, if return_fast_forward_steps is True.
249
+ """
250
+
251
+ datasets_config.STREAMING_READ_MAX_RETRIES = 40 # default is 20
252
+ datasets_config.STREAMING_READ_RETRY_INTERVAL = 10 # default is 5
253
+ download_config = DownloadConfig(
254
+ max_retries=20, # default is 1 and can lead to premature HTTPS errors
255
+ )
256
+
257
+ fast_forward_steps = 0
258
+
259
+ if data_config.dataset.name == "pico-lm/pretokenized-dolma":
260
+ # NOTE: We know that the dataset is sharded into 10,000 shards, so we can easily compute
261
+ # the data file that we need to load in that contains the batch of data at
262
+ # initial_batch_step.
263
+
264
+ if initial_batch_step is not None:
265
+ examples_per_shard = 20_480
266
+ total_shards = 10_000
267
+ batches_per_shard = examples_per_shard // data_config.dataloader.batch_size
268
+ shard_idx = initial_batch_step // batches_per_shard
269
+
270
+ data_files = [
271
+ f"data/train-{str(_shard_idx).zfill(5)}-of-{total_shards}.parquet"
272
+ for _shard_idx in range(shard_idx, total_shards)
273
+ ]
274
+
275
+ fast_forward_steps = initial_batch_step % batches_per_shard
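+ # e.g. with a dataloader batch size of 1024: batches_per_shard = 20480 // 1024 = 20,
+ # so initial_batch_step = 45 maps to shard_idx = 2 and fast_forward_steps = 5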
276
+ else:
277
+ data_files = None
278
+
279
+ base_dataset = load_dataset(
280
+ data_config.dataset.name,
281
+ split="train",
282
+ streaming=True,
283
+ data_files=data_files,
284
+ download_config=download_config,
285
+ )
286
+ else:
287
+ # NOTE: For other datasets, you might want to add some custom loading logic, especially
288
+ # to help with loading or fast-forwarding to the correct batch.
289
+
290
+ base_dataset = load_dataset(
291
+ data_config.dataset.name,
292
+ split="train",
293
+ streaming=True,
294
+ download_config=download_config,
295
+ )
296
+
297
+ if data_config.dataset.name == "pico-lm/pretokenized-dolma":
298
+ from .data import ShardedIterableDataset
299
+
300
+ # NOTE: We wrap the dataset in a ShardedIterableDataset, which is a custom class that
301
+ # allows us to shard an iterable dataset across multiple processes. This is useful for
302
+ # distributed training, where we want data-parallelism.
303
+ dataset = ShardedIterableDataset(
304
+ base_dataset, fabric.global_rank, fabric.world_size
305
+ )
306
+ else:
307
+ dataset = base_dataset
308
+
309
+ if return_fast_forward_steps:
310
+ return dataset, fast_forward_steps
311
+ else:
312
+ return dataset
313
+
314
+
315
+ def initialize_tokenizer(data_config: DataConfig):
316
+ """Initialize the tokenizer for text processing.
317
+
318
+ This function can be extended to include custom tokenization logic.
319
+
320
+ Args:
321
+ data_config: Configuration object containing tokenizer settings.
322
+
323
+ Returns:
324
+ AutoTokenizer: A HuggingFace tokenizer instance.
325
+ """
326
+
327
+ return AutoTokenizer.from_pretrained(data_config.tokenizer.name)
328
+
329
+
330
+ def initialize_dataloader(
331
+ data_config: DataConfig,
332
+ training_config: TrainingConfig,
333
+ fabric: L.Fabric,
334
+ dataset: Dataset,
335
+ ):
336
+ """Initialize the DataLoader for efficient batch processing.
337
+
338
+ Creates a PyTorch DataLoader that handles batching and data loading for training.
339
+ Configured specifically for streaming tokenized text datasets.
340
+
341
+ You might also want to extend this function to add a sampler, or some sort of custom
342
+ collate function. For the default dataset, we don't need any of this, because the data are
343
+ pre-shuffled, and pre-tokenized.
344
+
345
+ Args:
346
+ data_config: Configuration object containing dataloader settings.
347
+ training_config: Configuration object containing training settings.
348
+ fabric: A Lightning Fabric instance.
349
+ dataset: A HuggingFace Dataset object containing tokenized text data.
350
+ Expected to have 'input_ids' field in its items.
351
+
352
+ Returns:
353
+ DataLoader: PyTorch DataLoader instance configured for the dataset.
354
+ """
355
+
356
+ def _collate_fn(batch):
357
+ return {"input_ids": [entry["input_ids"] for entry in batch]}
358
+
359
+ sub_batch_size = data_config.dataloader.batch_size // (
360
+ fabric.world_size * training_config.optimization.gradient_accumulation_steps
361
+ )
362
+
363
+ # NOTE: We use the sub-batch size for the dataloader, which is the full batch size
364
+ # divided by the world size and the gradient accumulation steps. This ensures that the
365
+ # effective (global) batch size matches the configured value.
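+ # e.g. a global batch size of 1024 on 2 devices with 4 gradient accumulation steps
+ # yields a per-device sub-batch size of 1024 // (2 * 4) = 128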
366
+
367
+ return DataLoader(
368
+ dataset,
369
+ batch_size=sub_batch_size,
370
+ shuffle=False, # Keep sequential for streaming datasets
371
+ pin_memory=True, # Speeds up transfer to GPU
372
+ collate_fn=_collate_fn,
373
+ )
374
+
375
+
376
+ ########################################################
377
+ #
378
+ # Model Initialization
379
+ #
380
+ ########################################################
381
+
382
+
383
+ def initialize_model(model_config: ModelConfig):
384
+ """Initialize the model for training.
385
+
386
+ Loads in a given model implemented in the `src.model` package and returns it.
387
+
388
+ NOTE: out of the box we currently only support the PicoDecoder model (a causal transformer
389
+ language model). If you'd like to implement your own model, you can do so by adding a new
390
+ model class in the `src.model` package, and then adding a new entry here.
391
+
392
+ Args:
393
+ model_config: Configuration object containing model settings.
394
+
395
+ Returns:
396
+ PyTorch model instance.
397
+
398
+ """
399
+ if model_config.model_type == "pico_decoder":
400
+ return PicoDecoder(model_config)
401
+ else:
402
+ raise ValueError(f"Invalid model type: {model_config.model_type}")
403
+
404
+
405
+ ########################################################
406
+ #
407
+ # Optimizer and Scheduler
408
+ #
409
+ ########################################################
410
+
411
+
412
+ def initialize_optimizer(training_config: TrainingConfig, model: torch.nn.Module):
413
+ """Initialize the optimizer for model training.
414
+
415
+ Creates an optimizer instance based on the configuration settings.
416
+
417
+ Add whatever other optimizers you want here.
418
+
419
+ Args:
420
+ training_config: Configuration object containing optimizer settings.
421
+ Must have:
422
+ - optimization.optimizer (str): Name of the optimizer ("adamw")
423
+ - optimization.lr (float): Learning rate for the optimizer
424
+ model: PyTorch model whose parameters will be optimized.
425
+
426
+ Returns:
427
+ torch.optim.Optimizer: Configured optimizer instance.
428
+
429
+ """
430
+
431
+ if training_config.optimization.optimizer == "adamw":
432
+ optimizer = torch.optim.AdamW(
433
+ model.parameters(), lr=training_config.optimization.lr
434
+ )
435
+ else:
436
+ raise ValueError(f"Invalid optimizer: {training_config.optimization.optimizer}")
437
+
438
+ return optimizer
439
+
440
+
+ def initialize_lr_scheduler(
+     training_config: TrainingConfig, optimizer: torch.optim.Optimizer
+ ):
+     """Initialize a learning rate scheduler with warmup and decay.
+
+     The default is a learning rate scheduler that implements a linear warmup followed by
+     a linear decay. The learning rate increases linearly from 0 to the initial lr
+     during warmup, then decreases linearly to 0 during the remaining steps.
+
+     Add other types of learning rate schedulers here.
+
+     Args:
+         training_config: Configuration object containing optimizer and scheduler settings.
+         optimizer: PyTorch optimizer whose learning rate will be scheduled.
+
+     Returns:
+         torch.optim.lr_scheduler.LambdaLR: Learning rate scheduler instance.
+     """
+
+     if training_config.optimization.lr_scheduler == "linear_with_warmup":
+         # Credit where credit is due:
+         # https://github.com/huggingface/transformers/blob/e71a01a104dd663c730e494eb0b6467bb51df357/src/transformers/optimization.py#L102
+         def _lr_lambda(curr_step, num_warmup_steps, max_steps):
+             if curr_step < num_warmup_steps:
+                 return float(curr_step) / float(max(1, num_warmup_steps))
+             else:
+                 return max(
+                     0.0,
+                     float(max_steps - curr_step)
+                     / float(max(1, max_steps - num_warmup_steps)),
+                 )
+
+         lr_lambda = lambda step: _lr_lambda(  # noqa: E731
+             step,
+             training_config.optimization.lr_warmup_steps,
+             training_config.max_steps,
+         )
+         lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
+             optimizer,
+             lr_lambda,
+         )
+     elif training_config.optimization.lr_scheduler == "cosine":
+         # Cosine decay with warmup: linear warmup followed by cosine decay.
+         # This provides a sustained learning rate over long training runs.
+         def _cosine_lr_lambda(curr_step, num_warmup_steps, max_steps):
+             if curr_step < num_warmup_steps:
+                 # Linear warmup
+                 return float(curr_step) / float(max(1, num_warmup_steps))
+             else:
+                 # Cosine decay, floored at 0.1 * initial_lr (never decays all the way to 0)
+                 progress = float(curr_step - num_warmup_steps) / float(
+                     max(1, max_steps - num_warmup_steps)
+                 )
+                 return max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
+
+         lr_lambda = lambda step: _cosine_lr_lambda(  # noqa: E731
+             step,
+             training_config.optimization.lr_warmup_steps,
+             training_config.max_steps,
+         )
+         lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
+             optimizer,
+             lr_lambda,
+         )
+     else:
+         raise ValueError(
+             f"Invalid learning rate scheduler: {training_config.optimization.lr_scheduler}"
+         )
+
+     return lr_scheduler
+
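The two schedules are easiest to compare by evaluating the multipliers directly; a small standalone sketch, assuming 100 warmup steps and 1,000 total steps (illustrative values only):

    import math

    num_warmup_steps, max_steps = 100, 1_000

    def linear(step):
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)
        return max(0.0, (max_steps - step) / max(1, max_steps - num_warmup_steps))

    def cosine(step):
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)
        progress = (step - num_warmup_steps) / max(1, max_steps - num_warmup_steps)
        return max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))

    for step in (0, 50, 100, 550, 1_000):
        print(step, round(linear(step), 3), round(cosine(step), 3))
    # 0     0.0   0.0
    # 50    0.5   0.5
    # 100   1.0   1.0
    # 550   0.5   0.5
    # 1000  0.0   0.1   <- cosine never drops below 0.1 * initial_lr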
+
+ ########################################################
+ #
+ # Experiment Monitoring (Logging, Experiment Tracking, etc.)
+ #
+ ########################################################
+
+
+ def _initialize_log_file(checkpointing_config: CheckpointingConfig) -> str:
+     """Create and initialize a timestamped log file in the run's log directory.
+
+     Sets up a log file with a unique timestamp in the run's logging directory.
+     Creates the necessary directory structure if it doesn't exist.
+
+     Directory Structure:
+         {checkpointing_config.runs_dir}/
+         └── {checkpointing_config.run_name}/
+             └── {checkpointing_config.logs_dir}/
+                 └── log_YYYYMMDD_HHMMSS.log
+
+     Args:
+         checkpointing_config: Configuration object containing checkpointing settings.
+
+     Returns:
+         str: Path to the created log file.
+     """
+
+     run_dir = os.path.join(checkpointing_config.runs_dir, checkpointing_config.run_name)
+     logs_dir = os.path.join(run_dir, checkpointing_config.logs_dir)
+     os.makedirs(logs_dir, exist_ok=True)
+
+     # Timestamp ensures each (re)start of the run gets its own log file
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     log_file_name = f"log_{timestamp}.log"
+     log_file_path = os.path.join(logs_dir, log_file_name)
+
+     open(log_file_path, "w").close()  # Create an empty log file
+
+     return log_file_path
+
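To make the config-to-path mapping concrete, a sketch with hypothetical config values (the real values come from `CheckpointingConfig`):

    import os
    from datetime import datetime

    # Hypothetical config values, for illustration only:
    runs_dir, run_name, logs_dir = "runs", "pico-decoder-tiny", "logs"

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    print(os.path.join(runs_dir, run_name, logs_dir, f"log_{timestamp}.log"))
    # e.g. runs/pico-decoder-tiny/logs/log_20250101_120000.log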
+
+ @use_backoff()
+ def initialize_wandb(
+     monitoring_config: MonitoringConfig, checkpointing_config: CheckpointingConfig
+ ):
+     """Initialize Weights and Biases.
+
+     Sets up a Weights and Biases logger based on the configuration settings. If the run
+     is auto-resuming from a checkpoint, tries to re-attach to the existing wandb run with
+     the same name instead of starting a new one.
+
+     Args:
+         monitoring_config: Configuration object containing monitoring settings.
+         checkpointing_config: Configuration object containing checkpointing settings.
+
+     Returns:
+         Optional[WandbLogger]: An experiment tracker instance.
+     """
+
+     assert (
+         monitoring_config.wandb.project is not None
+         and monitoring_config.wandb.project != ""
+     ), "Wandb project must be provided if wandb is to be used."
+     assert (
+         monitoring_config.wandb.entity is not None
+         and monitoring_config.wandb.entity != ""
+     ), "Wandb entity must be provided if wandb is to be used."
+
+     _run_id = None
+     if checkpointing_config.training.auto_resume:
+         # If we are loading a checkpoint, we can try to find the run id of the previous run
+         previous_runs = wandb.Api().runs(
+             path=f"{monitoring_config.wandb.entity}/{monitoring_config.wandb.project}",
+             filters={"display_name": checkpointing_config.run_name},
+         )
+         try:
+             if len(previous_runs) == 1:
+                 _run_id = previous_runs[0].id
+         except ValueError:
+             # The runs query is resolved lazily and can fail (e.g. if the project does
+             # not exist yet); in that case we simply start a fresh run.
+             pass
+
+     wandb_logger = WandbLogger(
+         project=monitoring_config.wandb.project,
+         entity=monitoring_config.wandb.entity,
+         id=_run_id,
+         name=checkpointing_config.run_name,
+     )
+
+     return wandb_logger
+
+
+ @rank_zero_only
+ def initialize_logging(
+     monitoring_config: MonitoringConfig,
+     checkpointing_config: CheckpointingConfig,
+     fabric: L.Fabric,
+ ):
+     """Initialize the default logging system, writing to both file and console.
+
+     The default logging system uses a file handler and a stream handler.
+
+     NOTE: this function is only called on rank 0.
+
+     Args:
+         monitoring_config: Configuration object containing monitoring settings.
+         checkpointing_config: Configuration object containing checkpointing settings.
+         fabric: Lightning Fabric instance for the current run.
+
+     Returns:
+         logger: Standard Python logger configured for file and console output.
+     """
+
+     # ---- Standard Local Logger ---- #
+     logger = logging.getLogger("pico-train")
+     logger.setLevel(logging.INFO)
+
+     # Create file handler
+     log_file_path = _initialize_log_file(checkpointing_config)
+     file_handler = logging.FileHandler(log_file_path, encoding="utf-8")
+     file_handler.setLevel(monitoring_config.logging.log_level)
+
+     # Create formatter and add it to the handler
+     formatter = logging.Formatter(
+         "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+         datefmt="%Y-%m-%d %H:%M:%S",
+     )
+     file_handler.setFormatter(formatter)
+
+     # Add the handler to the logger
+     logger.addHandler(file_handler)
+
+     # Add a stream handler for console output
+     stream_handler = logging.StreamHandler()
+     stream_handler.setLevel(monitoring_config.logging.log_level)
+     stream_handler.setFormatter(formatter)
+     logger.addHandler(stream_handler)
+
+     return logger
+
+
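Once initialized, the logger behaves like any standard Python logger, with every record going to both the timestamped log file and the console; a usage sketch (the config objects and fabric instance are assumed to exist already):

    logger = initialize_logging(monitoring_config, checkpointing_config, fabric)
    logger.info("Starting training run")
    # -> 2025-01-01 12:00:00 - pico-train - INFO - Starting training run
    #    (written to the console and to runs/<run_name>/<logs_dir>/log_*.log)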
+ ########################################################
+ #
+ # HuggingFace/Remote Checkpointing
+ #
+ ########################################################
+
+
+ @rank_zero_only
+ @use_backoff()
+ def initialize_hf_checkpointing(
+     checkpointing_config: CheckpointingConfig, fabric: L.Fabric
+ ):
+     """Initialize HuggingFace Checkpointing.
+
+     Creates a HuggingFace repository if it doesn't exist, and creates a branch named
+     after the run.
+
+     NOTE: this function is only called on rank 0.
+
+     Args:
+         checkpointing_config: Configuration object containing checkpointing settings;
+             must have a 'hf_checkpoint' attribute that specifies the HuggingFace
+             repository id and collection slug (if applicable) to save the checkpoint to.
+         fabric: Lightning Fabric instance for the current run.
+
+     Raises:
+         Exception: If the HuggingFace repository cannot be created after the retry
+             attempts made by @use_backoff.
+     """
+
+     huggingface_repo_id = checkpointing_config.hf_checkpoint.repo_id
+     assert (
+         huggingface_repo_id is not None and huggingface_repo_id != ""
+     ), "hf_checkpoint.repo_id must be provided."
+
+     repo = create_repo(huggingface_repo_id, exist_ok=True)
+
+     # A repo can be created without a specified namespace (it will default to the
+     # username), but the rest of the HF calls need the fully qualified name. That name
+     # is returned by create_repo, so we update the config for later calls.
+     checkpointing_config.hf_checkpoint.repo_id = repo.repo_id
+     huggingface_repo_id = repo.repo_id
+
+     if checkpointing_config.hf_checkpoint.collection_slug:
+         add_collection_item(
+             checkpointing_config.hf_checkpoint.collection_slug,
+             huggingface_repo_id,
+             repo.repo_type,
+             exists_ok=True,
+         )
+
+     create_branch(
+         repo_id=huggingface_repo_id,
+         branch=checkpointing_config.run_name,
+         exist_ok=True,
+     )
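Because each run checkpoints to its own branch, a given run's artifacts can later be pulled by passing the run name as the revision; a sketch, using illustrative repo and run names and assuming a config.json was pushed to that branch:

    from huggingface_hub import hf_hub_download

    # Illustrative names; substitute your own repo_id and run_name.
    path = hf_hub_download(
        repo_id="ThomasTheMaker/pico-decoder-tiny",
        filename="config.json",
        revision="pico-decoder-tiny-dolma5M-v1",  # branch created above
    )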
src/training/utils/io.py ADDED
@@ -0,0 +1,52 @@
+ """Defines a retry wrapper for IO operations."""
+
+ import time
+ from functools import wraps
+
+
+ def use_backoff(max_retries=2, initial_delay=1, backoff_factor=2):
+     """
+     Universal retry wrapper with exponential backoff for any function, but primarily for
+     loading and storing HuggingFace datasets and objects.
+
+     Example usage:
+
+     >>> @use_backoff(max_retries=10, initial_delay=1, backoff_factor=2)
+     >>> def important_io_operation(x):
+     >>>     return x + 1
+
+     Args:
+         max_retries: Maximum number of retry attempts (default: 2)
+         initial_delay: Initial delay between retries in seconds (default: 1)
+         backoff_factor: Multiplier for the delay between retries (default: 2)
+
+     Returns:
+         A decorator that retries the wrapped function up to max_retries times with
+         exponential backoff
+
+     Raises:
+         Exception: If all retries fail
+     """
+
+     def _decorator(fn):
+         @wraps(fn)
+         def wrapper(*args, **kwargs):
+             current_delay = initial_delay
+             last_exception = None
+
+             for attempt in range(max_retries):
+                 try:
+                     return fn(*args, **kwargs)
+                 except Exception as e:
+                     last_exception = e
+                     if attempt < max_retries - 1:  # Don't sleep on the last attempt
+                         time.sleep(current_delay)
+                         current_delay *= backoff_factor
+
+             raise Exception(
+                 f"IO Operation failed after {max_retries} attempts: {str(last_exception)}"
+             )
+
+         return wrapper
+
+     return _decorator
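The decorator is easiest to sanity-check with a deliberately flaky function; a small sketch using only the defaults defined above:

    calls = {"n": 0}

    @use_backoff()  # max_retries=2, initial_delay=1, backoff_factor=2
    def flaky_read():
        calls["n"] += 1
        if calls["n"] < 2:
            raise IOError("transient failure")
        return "ok"

    print(flaky_read())  # attempt 1 fails, then succeeds after a 1s sleep -> "ok"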
src/training/utils/logging.py ADDED
@@ -0,0 +1,48 @@
+ """
+ Miscellaneous logging utilities.
+ """
+
+ from io import StringIO
+
+ import yaml
+ from lightning.fabric.utilities.rank_zero import rank_zero_only
+ from rich.console import Console
+ from rich.panel import Panel
+
+
+ @rank_zero_only
+ def pretty_print_yaml_config(logger, config: dict) -> None:
+     """
+     Pretty print a config with rich formatting. Assumes that the config has already been
+     converted to a dictionary - this can be done by calling `asdict` on the dataclass or
+     by loading the config from a yaml file.
+
+     NOTE: this function is only called on rank 0.
+
+     Args:
+         logger: Logger object to log the formatted output to.
+         config: Dictionary containing the config to pretty print.
+     """
+     # Create string buffer
+     output = StringIO()
+     console = Console(file=output, force_terminal=False)
+
+     # Convert to YAML string first
+     yaml_str = yaml.dump(
+         config, default_flow_style=False, sort_keys=False, Dumper=yaml.SafeDumper
+     )
+
+     # Create formatted panel
+     panel = Panel(
+         yaml_str,
+         border_style="blue",
+         padding=(0, 1),  # Reduced padding
+         expand=False,  # Don't expand to terminal width
+     )
+
+     # Print to buffer
+     console.print(panel)
+
+     # Log the formatted output line by line
+     for line in output.getvalue().splitlines():
+         logger.info(line)
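A typical call site converts the dataclass config first; a sketch, assuming a config dataclass instance (e.g. a TrainingConfig) and an already-initialized logger:

    from dataclasses import asdict

    # `training_config` and `logger` are assumed to exist already.
    pretty_print_yaml_config(logger, asdict(training_config))
    # Each line of the blue-bordered YAML panel is emitted via logger.info(...)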