ThomasTheMaker committed on
Commit
b5f841f
·
verified ·
1 Parent(s): c6ae8e9

Upload folder using huggingface_hub

Browse files
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/config.json CHANGED
@@ -17,6 +17,6 @@
17
  "norm_eps": 1e-06,
18
  "position_emb_theta": 10000.0,
19
  "torch_dtype": "float32",
20
- "transformers_version": "4.48.3",
21
  "vocab_size": 50304
22
  }
 
17
  "norm_eps": 1e-06,
18
  "position_emb_theta": 10000.0,
19
  "torch_dtype": "float32",
20
+ "transformers_version": "4.55.4",
21
  "vocab_size": 50304
22
  }
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/fabric_state/checkpoint.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3add1b50ed75802e72e92667b9a06fb9799a3a0fc8a12baa5bb43d4c1941a2ab
3
  size 135543171
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fd044dad1851e94ba0ceb88090254ed7a248d8d3119d6688a2c7dd15e3b7865
3
  size 135543171
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/generation_config.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
- "transformers_version": "4.48.3",
3
  "vocab_size": 50304
4
  }
 
1
  {
2
+ "transformers_version": "4.55.4",
3
  "vocab_size": 50304
4
  }
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_activations.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8910ad571404374d5d26adf3151cf7749b571908b14ece94076013fc2ced8b92
3
+ size 33819
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:254891345b1a9d809e9c5c0a1532693b94d769025a317ad82bc418dfa3f7b40b
3
+ size 71640
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/dataset_info.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "input_ids": {
6
+ "feature": {
7
+ "dtype": "int32",
8
+ "_type": "Value"
9
+ },
10
+ "_type": "Sequence"
11
+ },
12
+ "text": {
13
+ "dtype": "string",
14
+ "_type": "Value"
15
+ }
16
+ },
17
+ "homepage": "",
18
+ "license": ""
19
+ }
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "3da9a89786e6494d",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_gradients.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d70f9fca27cf0bed975a63ab4f6cf4925792f880630b8d3da7180c89deadb4f
3
+ size 2371527
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_weights.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa1c848f47e8f22b70e5ae91c318ab8432a3b05f4456b771304768c0a0c42431
3
+ size 2371443
pico-decoder-tiny-dolma5M-v1/logs/log_20250829_224829.log ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-08-29 22:50:26 - pico-train - INFO - Step 20000 -- 📊 Evaluation Results
2
+ 2025-08-29 22:50:26 - pico-train - INFO - └── paloma: 1.8399778163273925e+24
3
+ 2025-08-29 22:50:26 - pico-train - INFO - ==================================================
4
+ 2025-08-29 22:50:26 - pico-train - INFO - ✨ Training Configuration
5
+ 2025-08-29 22:50:26 - pico-train - INFO - ==================================================
6
+ 2025-08-29 22:50:26 - pico-train - INFO - ╭─────────────────────────────────────────────────────╮
7
+ 2025-08-29 22:50:26 - pico-train - INFO - │ checkpointing: │
8
+ 2025-08-29 22:50:26 - pico-train - INFO - │ checkpoints_dir: checkpoints │
9
+ 2025-08-29 22:50:26 - pico-train - INFO - │ evaluation: │
10
+ 2025-08-29 22:50:26 - pico-train - INFO - │ eval_results_dir: eval_results │
11
+ 2025-08-29 22:50:26 - pico-train - INFO - │ fabric_checkpoint_dir: fabric_state │
12
+ 2025-08-29 22:50:26 - pico-train - INFO - │ fabric_checkpoint_filename: checkpoint.pt │
13
+ 2025-08-29 22:50:26 - pico-train - INFO - │ hf_checkpoint: │
14
+ 2025-08-29 22:50:26 - pico-train - INFO - │ collection_slug: null │
15
+ 2025-08-29 22:50:26 - pico-train - INFO - │ repo_id: ThomasTheMaker/pico-decoder-tiny │
16
+ 2025-08-29 22:50:26 - pico-train - INFO - │ learning_dynamics: │
17
+ 2025-08-29 22:50:26 - pico-train - INFO - │ batch_size: 1 │
18
+ 2025-08-29 22:50:26 - pico-train - INFO - │ eval_data: null │
19
+ 2025-08-29 22:50:26 - pico-train - INFO - │ layer_suffixes: │
20
+ 2025-08-29 22:50:26 - pico-train - INFO - │ - attention.v_proj │
21
+ 2025-08-29 22:50:26 - pico-train - INFO - │ - attention.o_proj │
22
+ 2025-08-29 22:50:26 - pico-train - INFO - │ - swiglu.w_2 │
23
+ 2025-08-29 22:50:26 - pico-train - INFO - │ sequence_idx: -1 │
24
+ 2025-08-29 22:50:26 - pico-train - INFO - │ learning_dynamics_dir: learning_dynamics │
25
+ 2025-08-29 22:50:26 - pico-train - INFO - │ logs_dir: logs │
26
+ 2025-08-29 22:50:26 - pico-train - INFO - │ run_name: pico-decoder-tiny-dolma5M-v1 │
27
+ 2025-08-29 22:50:26 - pico-train - INFO - │ runs_dir: runs │
28
+ 2025-08-29 22:50:26 - pico-train - INFO - │ save_every_n_steps: 500 │
29
+ 2025-08-29 22:50:26 - pico-train - INFO - │ save_to_hf: true │
30
+ 2025-08-29 22:50:26 - pico-train - INFO - │ training: │
31
+ 2025-08-29 22:50:26 - pico-train - INFO - │ auto_resume: true │
32
+ 2025-08-29 22:50:26 - pico-train - INFO - │ data: │
33
+ 2025-08-29 22:50:26 - pico-train - INFO - │ dataloader: │
34
+ 2025-08-29 22:50:26 - pico-train - INFO - │ batch_size: 4 │
35
+ 2025-08-29 22:50:26 - pico-train - INFO - │ dataset: │
36
+ 2025-08-29 22:50:26 - pico-train - INFO - │ name: ThomasTheMaker/pretokenized-dolma-5M │
37
+ 2025-08-29 22:50:26 - pico-train - INFO - │ tokenizer: │
38
+ 2025-08-29 22:50:26 - pico-train - INFO - │ name: allenai/OLMo-7B-0724-hf │
39
+ 2025-08-29 22:50:26 - pico-train - INFO - │ vocab_size: 50304 │
40
+ 2025-08-29 22:50:26 - pico-train - INFO - │ evaluation: │
41
+ 2025-08-29 22:50:26 - pico-train - INFO - │ metrics: │
42
+ 2025-08-29 22:50:26 - pico-train - INFO - │ - paloma │
43
+ 2025-08-29 22:50:26 - pico-train - INFO - │ paloma: │
44
+ 2025-08-29 22:50:26 - pico-train - INFO - │ batch_size: 1 │
45
+ 2025-08-29 22:50:26 - pico-train - INFO - │ dataset_name: pico-lm/pretokenized-paloma-tinsy │
46
+ 2025-08-29 22:50:26 - pico-train - INFO - │ dataset_split: val │
47
+ 2025-08-29 22:50:26 - pico-train - INFO - │ max_length: 2048 │
48
+ 2025-08-29 22:50:26 - pico-train - INFO - │ model: │
49
+ 2025-08-29 22:50:26 - pico-train - INFO - │ activation_hidden_dim: 384 │
50
+ 2025-08-29 22:50:26 - pico-train - INFO - │ attention_n_heads: 12 │
51
+ 2025-08-29 22:50:26 - pico-train - INFO - │ attention_n_kv_heads: 4 │
52
+ 2025-08-29 22:50:26 - pico-train - INFO - │ batch_size: 1024 │
53
+ 2025-08-29 22:50:26 - pico-train - INFO - │ d_model: 96 │
54
+ 2025-08-29 22:50:26 - pico-train - INFO - │ max_seq_len: 2048 │
55
+ 2025-08-29 22:50:26 - pico-train - INFO - │ model_type: pico_decoder │
56
+ 2025-08-29 22:50:26 - pico-train - INFO - │ n_layers: 12 │
57
+ 2025-08-29 22:50:26 - pico-train - INFO - │ norm_eps: 1.0e-06 │
58
+ 2025-08-29 22:50:26 - pico-train - INFO - │ position_emb_theta: 10000.0 │
59
+ 2025-08-29 22:50:26 - pico-train - INFO - │ vocab_size: 50304 │
60
+ 2025-08-29 22:50:26 - pico-train - INFO - │ monitoring: │
61
+ 2025-08-29 22:50:26 - pico-train - INFO - │ logging: │
62
+ 2025-08-29 22:50:26 - pico-train - INFO - │ log_every_n_steps: 25 │
63
+ 2025-08-29 22:50:26 - pico-train - INFO - │ log_level: INFO │
64
+ 2025-08-29 22:50:26 - pico-train - INFO - │ save_to_wandb: false │
65
+ 2025-08-29 22:50:26 - pico-train - INFO - │ wandb: │
66
+ 2025-08-29 22:50:26 - pico-train - INFO - │ entity: boymyc │
67
+ 2025-08-29 22:50:26 - pico-train - INFO - │ project: pico-decoder-tiny │
68
+ 2025-08-29 22:50:26 - pico-train - INFO - │ training: │
69
+ 2025-08-29 22:50:26 - pico-train - INFO - │ fabric: │
70
+ 2025-08-29 22:50:26 - pico-train - INFO - │ accelerator: cuda │
71
+ 2025-08-29 22:50:26 - pico-train - INFO - │ num_devices: 1 │
72
+ 2025-08-29 22:50:26 - pico-train - INFO - │ num_nodes: 1 │
73
+ 2025-08-29 22:50:26 - pico-train - INFO - │ precision: bf16-mixed │
74
+ 2025-08-29 22:50:26 - pico-train - INFO - │ max_steps: 20000 │
75
+ 2025-08-29 22:50:26 - pico-train - INFO - │ optimization: │
76
+ 2025-08-29 22:50:26 - pico-train - INFO - │ gradient_accumulation_steps: 4 │
77
+ 2025-08-29 22:50:26 - pico-train - INFO - │ lr: 5.0e-05 │
78
+ 2025-08-29 22:50:26 - pico-train - INFO - │ lr_scheduler: cosine │
79
+ 2025-08-29 22:50:26 - pico-train - INFO - │ lr_warmup_steps: 8000 │
80
+ 2025-08-29 22:50:26 - pico-train - INFO - │ optimizer: adamw │
81
+ 2025-08-29 22:50:26 - pico-train - INFO - │ │
82
+ 2025-08-29 22:50:26 - pico-train - INFO - ╰─────────────────────────────────────────────────────╯
83
+ 2025-08-29 22:50:26 - pico-train - INFO - ==================================================
84
+ 2025-08-29 22:50:26 - pico-train - INFO - ⛭ Runtime Summary:
85
+ 2025-08-29 22:50:26 - pico-train - INFO - ==================================================
86
+ 2025-08-29 22:50:26 - pico-train - INFO - Starting from step: 20000
87
+ 2025-08-29 22:50:26 - pico-train - INFO - Model Setup:
88
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ Total Parameters: 11,282,784
89
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ Trainable Parameters: 11,282,784
90
+ 2025-08-29 22:50:26 - pico-train - INFO - Distributed Setup:
91
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ Number of Devices: 1
92
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ Device Type: NVIDIA GeForce RTX 5090
93
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ Available Memory: 33.68 GB
94
+ 2025-08-29 22:50:26 - pico-train - INFO - Software Setup:
95
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ Python Version: 3.10.12
96
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ PyTorch Version: 2.8.0+cu128
97
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ CUDA Version: 12.8
98
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ Operating System: Linux 6.8.0-63-generic
99
+ 2025-08-29 22:50:26 - pico-train - INFO - Batch Size Configuration:
100
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ Global Batch Size: 4
101
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ Per Device Batch Size: 1
102
+ 2025-08-29 22:50:26 - pico-train - INFO - └─ Gradient Accumulation Steps: 4
103
+ 2025-08-29 22:50:26 - pico-train - INFO - ==================================================
104
+ 2025-08-29 22:50:27 - pico-train - INFO - Step 20000 -- 🔄 Training Metrics
105
+ 2025-08-29 22:50:27 - pico-train - INFO - ├── Loss: 6.5103
106
+ 2025-08-29 22:50:27 - pico-train - INFO - ├── Learning Rate: 5.00e-06
107
+ 2025-08-29 22:50:27 - pico-train - INFO - └── Inf/NaN count: 0
108
+ 2025-08-29 22:50:27 - pico-train - INFO - Step 20000 -- 📈 Saving Learning Dynamics
109
+ 2025-08-29 22:50:43 - pico-train - INFO - Step 20025 -- 🔄 Training Metrics
110
+ 2025-08-29 22:50:43 - pico-train - INFO - ├── Loss: 6.4274
111
+ 2025-08-29 22:50:43 - pico-train - INFO - ├── Learning Rate: 3.45e-05
112
+ 2025-08-29 22:50:43 - pico-train - INFO - └── Inf/NaN count: 0
113
+ 2025-08-29 22:50:55 - pico-train - INFO - Step 20050 -- 🔄 Training Metrics
114
+ 2025-08-29 22:50:55 - pico-train - INFO - ├── Loss: 6.3770
115
+ 2025-08-29 22:50:55 - pico-train - INFO - ├── Learning Rate: 3.45e-05
116
+ 2025-08-29 22:50:55 - pico-train - INFO - └── Inf/NaN count: 0
117
+ 2025-08-29 22:51:08 - pico-train - INFO - Step 20075 -- 🔄 Training Metrics
118
+ 2025-08-29 22:51:08 - pico-train - INFO - ├── Loss: 6.2797
119
+ 2025-08-29 22:51:08 - pico-train - INFO - ├── Learning Rate: 3.44e-05
120
+ 2025-08-29 22:51:08 - pico-train - INFO - └── Inf/NaN count: 0
121
+ 2025-08-29 22:51:21 - pico-train - INFO - Step 20100 -- 🔄 Training Metrics
122
+ 2025-08-29 22:51:21 - pico-train - INFO - ├── Loss: 6.3924
123
+ 2025-08-29 22:51:21 - pico-train - INFO - ├── Learning Rate: 3.43e-05
124
+ 2025-08-29 22:51:21 - pico-train - INFO - └── Inf/NaN count: 0
125
+ 2025-08-29 22:51:34 - pico-train - INFO - Step 20125 -- 🔄 Training Metrics
126
+ 2025-08-29 22:51:34 - pico-train - INFO - ├── Loss: 6.4442
127
+ 2025-08-29 22:51:34 - pico-train - INFO - ├── Learning Rate: 3.43e-05
128
+ 2025-08-29 22:51:34 - pico-train - INFO - └── Inf/NaN count: 0