Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/config.json +1 -1
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/fabric_state/checkpoint.pt +1 -1
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/generation_config.json +1 -1
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_activations.pt +3 -0
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/data-00000-of-00001.arrow +3 -0
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/dataset_info.json +19 -0
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/state.json +13 -0
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_gradients.pt +3 -0
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_weights.pt +3 -0
pico-decoder-tiny-dolma5M-v1/logs/log_20250829_224829.log +128 -0

pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/config.json CHANGED Viewed

@@ -17,6 +17,6 @@
   "norm_eps": 1e-06,
   "position_emb_theta": 10000.0,
   "torch_dtype": "float32",
-  "transformers_version": "4.48.3",
   "vocab_size": 50304
 }

   "norm_eps": 1e-06,
   "position_emb_theta": 10000.0,
   "torch_dtype": "float32",
+  "transformers_version": "4.55.4",
   "vocab_size": 50304
 }

pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/fabric_state/checkpoint.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3add1b50ed75802e72e92667b9a06fb9799a3a0fc8a12baa5bb43d4c1941a2ab
 size 135543171

 version https://git-lfs.github.com/spec/v1
+oid sha256:8fd044dad1851e94ba0ceb88090254ed7a248d8d3119d6688a2c7dd15e3b7865
 size 135543171

pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/generation_config.json CHANGED Viewed

@@ -1,4 +1,4 @@
 {
-  "transformers_version": "4.48.3",
   "vocab_size": 50304
 }

 {
+  "transformers_version": "4.55.4",
   "vocab_size": 50304
 }

pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_activations.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8910ad571404374d5d26adf3151cf7749b571908b14ece94076013fc2ced8b92
+size 33819

pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:254891345b1a9d809e9c5c0a1532693b94d769025a317ad82bc418dfa3f7b40b
+size 71640

pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/dataset_info.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "text": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}

pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/state.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "3da9a89786e6494d",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}

pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_gradients.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d70f9fca27cf0bed975a63ab4f6cf4925792f880630b8d3da7180c89deadb4f
+size 2371527

pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_weights.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa1c848f47e8f22b70e5ae91c318ab8432a3b05f4456b771304768c0a0c42431
+size 2371443

pico-decoder-tiny-dolma5M-v1/logs/log_20250829_224829.log ADDED Viewed

	@@ -0,0 +1,128 @@

+2025-08-29 22:50:26 - pico-train - INFO - Step 20000 -- 📊 Evaluation Results
+2025-08-29 22:50:26 - pico-train - INFO - └── paloma: 1.8399778163273925e+24
+2025-08-29 22:50:26 - pico-train - INFO - ==================================================
+2025-08-29 22:50:26 - pico-train - INFO - ✨ Training Configuration
+2025-08-29 22:50:26 - pico-train - INFO - ==================================================
+2025-08-29 22:50:26 - pico-train - INFO - ╭─────────────────────────────────────────────────────╮
+2025-08-29 22:50:26 - pico-train - INFO - │ checkpointing:                                      │
+2025-08-29 22:50:26 - pico-train - INFO - │   checkpoints_dir: checkpoints                      │
+2025-08-29 22:50:26 - pico-train - INFO - │   evaluation:                                       │
+2025-08-29 22:50:26 - pico-train - INFO - │     eval_results_dir: eval_results                  │
+2025-08-29 22:50:26 - pico-train - INFO - │   fabric_checkpoint_dir: fabric_state               │
+2025-08-29 22:50:26 - pico-train - INFO - │   fabric_checkpoint_filename: checkpoint.pt         │
+2025-08-29 22:50:26 - pico-train - INFO - │   hf_checkpoint:                                    │
+2025-08-29 22:50:26 - pico-train - INFO - │     collection_slug: null                           │
+2025-08-29 22:50:26 - pico-train - INFO - │     repo_id: ThomasTheMaker/pico-decoder-tiny       │
+2025-08-29 22:50:26 - pico-train - INFO - │   learning_dynamics:                                │
+2025-08-29 22:50:26 - pico-train - INFO - │     batch_size: 1                                   │
+2025-08-29 22:50:26 - pico-train - INFO - │     eval_data: null                                 │
+2025-08-29 22:50:26 - pico-train - INFO - │     layer_suffixes:                                 │
+2025-08-29 22:50:26 - pico-train - INFO - │     - attention.v_proj                              │
+2025-08-29 22:50:26 - pico-train - INFO - │     - attention.o_proj                              │
+2025-08-29 22:50:26 - pico-train - INFO - │     - swiglu.w_2                                    │
+2025-08-29 22:50:26 - pico-train - INFO - │     sequence_idx: -1                                │
+2025-08-29 22:50:26 - pico-train - INFO - │   learning_dynamics_dir: learning_dynamics          │
+2025-08-29 22:50:26 - pico-train - INFO - │   logs_dir: logs                                    │
+2025-08-29 22:50:26 - pico-train - INFO - │   run_name: pico-decoder-tiny-dolma5M-v1            │
+2025-08-29 22:50:26 - pico-train - INFO - │   runs_dir: runs                                    │
+2025-08-29 22:50:26 - pico-train - INFO - │   save_every_n_steps: 500                           │
+2025-08-29 22:50:26 - pico-train - INFO - │   save_to_hf: true                                  │
+2025-08-29 22:50:26 - pico-train - INFO - │   training:                                         │
+2025-08-29 22:50:26 - pico-train - INFO - │     auto_resume: true                               │
+2025-08-29 22:50:26 - pico-train - INFO - │ data:                                               │
+2025-08-29 22:50:26 - pico-train - INFO - │   dataloader:                                       │
+2025-08-29 22:50:26 - pico-train - INFO - │     batch_size: 4                                   │
+2025-08-29 22:50:26 - pico-train - INFO - │   dataset:                                          │
+2025-08-29 22:50:26 - pico-train - INFO - │     name: ThomasTheMaker/pretokenized-dolma-5M      │
+2025-08-29 22:50:26 - pico-train - INFO - │   tokenizer:                                        │
+2025-08-29 22:50:26 - pico-train - INFO - │     name: allenai/OLMo-7B-0724-hf                   │
+2025-08-29 22:50:26 - pico-train - INFO - │     vocab_size: 50304                               │
+2025-08-29 22:50:26 - pico-train - INFO - │ evaluation:                                         │
+2025-08-29 22:50:26 - pico-train - INFO - │   metrics:                                          │
+2025-08-29 22:50:26 - pico-train - INFO - │   - paloma                                          │
+2025-08-29 22:50:26 - pico-train - INFO - │   paloma:                                           │
+2025-08-29 22:50:26 - pico-train - INFO - │     batch_size: 1                                   │
+2025-08-29 22:50:26 - pico-train - INFO - │     dataset_name: pico-lm/pretokenized-paloma-tinsy │
+2025-08-29 22:50:26 - pico-train - INFO - │     dataset_split: val                              │
+2025-08-29 22:50:26 - pico-train - INFO - │     max_length: 2048                                │
+2025-08-29 22:50:26 - pico-train - INFO - │ model:                                              │
+2025-08-29 22:50:26 - pico-train - INFO - │   activation_hidden_dim: 384                        │
+2025-08-29 22:50:26 - pico-train - INFO - │   attention_n_heads: 12                             │
+2025-08-29 22:50:26 - pico-train - INFO - │   attention_n_kv_heads: 4                           │
+2025-08-29 22:50:26 - pico-train - INFO - │   batch_size: 1024                                  │
+2025-08-29 22:50:26 - pico-train - INFO - │   d_model: 96                                       │
+2025-08-29 22:50:26 - pico-train - INFO - │   max_seq_len: 2048                                 │
+2025-08-29 22:50:26 - pico-train - INFO - │   model_type: pico_decoder                          │
+2025-08-29 22:50:26 - pico-train - INFO - │   n_layers: 12                                      │
+2025-08-29 22:50:26 - pico-train - INFO - │   norm_eps: 1.0e-06                                 │
+2025-08-29 22:50:26 - pico-train - INFO - │   position_emb_theta: 10000.0                       │
+2025-08-29 22:50:26 - pico-train - INFO - │   vocab_size: 50304                                 │
+2025-08-29 22:50:26 - pico-train - INFO - │ monitoring:                                         │
+2025-08-29 22:50:26 - pico-train - INFO - │   logging:                                          │
+2025-08-29 22:50:26 - pico-train - INFO - │     log_every_n_steps: 25                           │
+2025-08-29 22:50:26 - pico-train - INFO - │     log_level: INFO                                 │
+2025-08-29 22:50:26 - pico-train - INFO - │   save_to_wandb: false                              │
+2025-08-29 22:50:26 - pico-train - INFO - │   wandb:                                            │
+2025-08-29 22:50:26 - pico-train - INFO - │     entity: boymyc                                  │
+2025-08-29 22:50:26 - pico-train - INFO - │     project: pico-decoder-tiny                      │
+2025-08-29 22:50:26 - pico-train - INFO - │ training:                                           │
+2025-08-29 22:50:26 - pico-train - INFO - │   fabric:                                           │
+2025-08-29 22:50:26 - pico-train - INFO - │     accelerator: cuda                               │
+2025-08-29 22:50:26 - pico-train - INFO - │     num_devices: 1                                  │
+2025-08-29 22:50:26 - pico-train - INFO - │     num_nodes: 1                                    │
+2025-08-29 22:50:26 - pico-train - INFO - │     precision: bf16-mixed                           │
+2025-08-29 22:50:26 - pico-train - INFO - │   max_steps: 20000                                  │
+2025-08-29 22:50:26 - pico-train - INFO - │   optimization:                                     │
+2025-08-29 22:50:26 - pico-train - INFO - │     gradient_accumulation_steps: 4                  │
+2025-08-29 22:50:26 - pico-train - INFO - │     lr: 5.0e-05                                     │
+2025-08-29 22:50:26 - pico-train - INFO - │     lr_scheduler: cosine                            │
+2025-08-29 22:50:26 - pico-train - INFO - │     lr_warmup_steps: 8000                           │
+2025-08-29 22:50:26 - pico-train - INFO - │     optimizer: adamw                                │
+2025-08-29 22:50:26 - pico-train - INFO - │                                                     │
+2025-08-29 22:50:26 - pico-train - INFO - ╰─────────────────────────────────────────────────────╯
+2025-08-29 22:50:26 - pico-train - INFO - ==================================================
+2025-08-29 22:50:26 - pico-train - INFO - ⛭ Runtime Summary:
+2025-08-29 22:50:26 - pico-train - INFO - ==================================================
+2025-08-29 22:50:26 - pico-train - INFO - Starting from step: 20000
+2025-08-29 22:50:26 - pico-train - INFO - Model Setup:
+2025-08-29 22:50:26 - pico-train - INFO - └─ Total Parameters: 11,282,784
+2025-08-29 22:50:26 - pico-train - INFO - └─ Trainable Parameters: 11,282,784
+2025-08-29 22:50:26 - pico-train - INFO - Distributed Setup:
+2025-08-29 22:50:26 - pico-train - INFO - └─ Number of Devices: 1
+2025-08-29 22:50:26 - pico-train - INFO - └─ Device Type: NVIDIA GeForce RTX 5090
+2025-08-29 22:50:26 - pico-train - INFO - └─ Available Memory: 33.68 GB
+2025-08-29 22:50:26 - pico-train - INFO - Software Setup:
+2025-08-29 22:50:26 - pico-train - INFO - └─ Python Version: 3.10.12
+2025-08-29 22:50:26 - pico-train - INFO - └─ PyTorch Version: 2.8.0+cu128
+2025-08-29 22:50:26 - pico-train - INFO - └─ CUDA Version: 12.8
+2025-08-29 22:50:26 - pico-train - INFO - └─ Operating System: Linux 6.8.0-63-generic
+2025-08-29 22:50:26 - pico-train - INFO - Batch Size Configuration:
+2025-08-29 22:50:26 - pico-train - INFO - └─ Global Batch Size: 4
+2025-08-29 22:50:26 - pico-train - INFO - └─ Per Device Batch Size: 1
+2025-08-29 22:50:26 - pico-train - INFO - └─ Gradient Accumulation Steps: 4
+2025-08-29 22:50:26 - pico-train - INFO - ==================================================
+2025-08-29 22:50:27 - pico-train - INFO - Step 20000 -- 🔄 Training Metrics
+2025-08-29 22:50:27 - pico-train - INFO - ├── Loss: 6.5103
+2025-08-29 22:50:27 - pico-train - INFO - ├── Learning Rate: 5.00e-06
+2025-08-29 22:50:27 - pico-train - INFO - └── Inf/NaN count: 0
+2025-08-29 22:50:27 - pico-train - INFO - Step 20000 -- 📈 Saving Learning Dynamics
+2025-08-29 22:50:43 - pico-train - INFO - Step 20025 -- 🔄 Training Metrics
+2025-08-29 22:50:43 - pico-train - INFO - ├── Loss: 6.4274
+2025-08-29 22:50:43 - pico-train - INFO - ├── Learning Rate: 3.45e-05
+2025-08-29 22:50:43 - pico-train - INFO - └── Inf/NaN count: 0
+2025-08-29 22:50:55 - pico-train - INFO - Step 20050 -- 🔄 Training Metrics
+2025-08-29 22:50:55 - pico-train - INFO - ├── Loss: 6.3770
+2025-08-29 22:50:55 - pico-train - INFO - ├── Learning Rate: 3.45e-05
+2025-08-29 22:50:55 - pico-train - INFO - └── Inf/NaN count: 0
+2025-08-29 22:51:08 - pico-train - INFO - Step 20075 -- 🔄 Training Metrics
+2025-08-29 22:51:08 - pico-train - INFO - ├── Loss: 6.2797
+2025-08-29 22:51:08 - pico-train - INFO - ├── Learning Rate: 3.44e-05
+2025-08-29 22:51:08 - pico-train - INFO - └── Inf/NaN count: 0
+2025-08-29 22:51:21 - pico-train - INFO - Step 20100 -- 🔄 Training Metrics
+2025-08-29 22:51:21 - pico-train - INFO - ├── Loss: 6.3924
+2025-08-29 22:51:21 - pico-train - INFO - ├── Learning Rate: 3.43e-05
+2025-08-29 22:51:21 - pico-train - INFO - └── Inf/NaN count: 0
+2025-08-29 22:51:34 - pico-train - INFO - Step 20125 -- 🔄 Training Metrics
+2025-08-29 22:51:34 - pico-train - INFO - ├── Loss: 6.4442
+2025-08-29 22:51:34 - pico-train - INFO - ├── Learning Rate: 3.43e-05
+2025-08-29 22:51:34 - pico-train - INFO - └── Inf/NaN count: 0