Upload folder using huggingface_hub
Browse files
- pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/config.json +1 -1
- pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/fabric_state/checkpoint.pt +1 -1
- pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/generation_config.json +1 -1
- pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_activations.pt +3 -0
- pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/data-00000-of-00001.arrow +3 -0
- pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/dataset_info.json +19 -0
- pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/state.json +13 -0
- pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_gradients.pt +3 -0
- pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_weights.pt +3 -0
- pico-decoder-tiny-dolma5M-v1/logs/log_20250829_224829.log +128 -0
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/config.json
CHANGED
@@ -17,6 +17,6 @@
|
|
17 |
"norm_eps": 1e-06,
|
18 |
"position_emb_theta": 10000.0,
|
19 |
"torch_dtype": "float32",
|
20 |
-
"transformers_version": "4.
|
21 |
"vocab_size": 50304
|
22 |
}
|
|
|
17 |
"norm_eps": 1e-06,
|
18 |
"position_emb_theta": 10000.0,
|
19 |
"torch_dtype": "float32",
|
20 |
+
"transformers_version": "4.55.4",
|
21 |
"vocab_size": 50304
|
22 |
}
|
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/fabric_state/checkpoint.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 135543171
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8fd044dad1851e94ba0ceb88090254ed7a248d8d3119d6688a2c7dd15e3b7865
|
3 |
size 135543171
|
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/generation_config.json
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
{
|
2 |
-
"transformers_version": "4.
|
3 |
"vocab_size": 50304
|
4 |
}
|
|
|
1 |
{
|
2 |
+
"transformers_version": "4.55.4",
|
3 |
"vocab_size": 50304
|
4 |
}
|
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_activations.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8910ad571404374d5d26adf3151cf7749b571908b14ece94076013fc2ced8b92
|
3 |
+
size 33819
|
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:254891345b1a9d809e9c5c0a1532693b94d769025a317ad82bc418dfa3f7b40b
|
3 |
+
size 71640
|
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/dataset_info.json
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"citation": "",
|
3 |
+
"description": "",
|
4 |
+
"features": {
|
5 |
+
"input_ids": {
|
6 |
+
"feature": {
|
7 |
+
"dtype": "int32",
|
8 |
+
"_type": "Value"
|
9 |
+
},
|
10 |
+
"_type": "Sequence"
|
11 |
+
},
|
12 |
+
"text": {
|
13 |
+
"dtype": "string",
|
14 |
+
"_type": "Value"
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"homepage": "",
|
18 |
+
"license": ""
|
19 |
+
}
|
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_data/state.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_data_files": [
|
3 |
+
{
|
4 |
+
"filename": "data-00000-of-00001.arrow"
|
5 |
+
}
|
6 |
+
],
|
7 |
+
"_fingerprint": "3da9a89786e6494d",
|
8 |
+
"_format_columns": null,
|
9 |
+
"_format_kwargs": {},
|
10 |
+
"_format_type": null,
|
11 |
+
"_output_all_columns": false,
|
12 |
+
"_split": null
|
13 |
+
}
|
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_gradients.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1d70f9fca27cf0bed975a63ab4f6cf4925792f880630b8d3da7180c89deadb4f
|
3 |
+
size 2371527
|
pico-decoder-tiny-dolma5M-v1/checkpoints/step_20000/learning_dynamics/train_weights.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aa1c848f47e8f22b70e5ae91c318ab8432a3b05f4456b771304768c0a0c42431
|
3 |
+
size 2371443
|
pico-decoder-tiny-dolma5M-v1/logs/log_20250829_224829.log
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-08-29 22:50:26 - pico-train - INFO - Step 20000 -- 📊 Evaluation Results
|
2 |
+
2025-08-29 22:50:26 - pico-train - INFO - └── paloma: 1.8399778163273925e+24
|
3 |
+
2025-08-29 22:50:26 - pico-train - INFO - ==================================================
|
4 |
+
2025-08-29 22:50:26 - pico-train - INFO - ✨ Training Configuration
|
5 |
+
2025-08-29 22:50:26 - pico-train - INFO - ==================================================
|
6 |
+
2025-08-29 22:50:26 - pico-train - INFO - โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ
|
7 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ checkpointing: โ
|
8 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ checkpoints_dir: checkpoints โ
|
9 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ evaluation: โ
|
10 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ eval_results_dir: eval_results โ
|
11 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ fabric_checkpoint_dir: fabric_state โ
|
12 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ fabric_checkpoint_filename: checkpoint.pt โ
|
13 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ hf_checkpoint: โ
|
14 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ collection_slug: null โ
|
15 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ repo_id: ThomasTheMaker/pico-decoder-tiny โ
|
16 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ learning_dynamics: โ
|
17 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ batch_size: 1 โ
|
18 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ eval_data: null โ
|
19 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ layer_suffixes: โ
|
20 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ - attention.v_proj โ
|
21 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ - attention.o_proj โ
|
22 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ - swiglu.w_2 โ
|
23 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ sequence_idx: -1 โ
|
24 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ learning_dynamics_dir: learning_dynamics โ
|
25 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ logs_dir: logs โ
|
26 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ run_name: pico-decoder-tiny-dolma5M-v1 โ
|
27 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ runs_dir: runs โ
|
28 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ save_every_n_steps: 500 โ
|
29 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ save_to_hf: true โ
|
30 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ training: โ
|
31 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ auto_resume: true โ
|
32 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ data: โ
|
33 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ dataloader: โ
|
34 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ batch_size: 4 โ
|
35 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ dataset: โ
|
36 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ name: ThomasTheMaker/pretokenized-dolma-5M โ
|
37 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ tokenizer: โ
|
38 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ name: allenai/OLMo-7B-0724-hf โ
|
39 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ vocab_size: 50304 โ
|
40 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ evaluation: โ
|
41 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ metrics: โ
|
42 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ - paloma โ
|
43 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ paloma: โ
|
44 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ batch_size: 1 โ
|
45 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ dataset_name: pico-lm/pretokenized-paloma-tinsy โ
|
46 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ dataset_split: val โ
|
47 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ max_length: 2048 โ
|
48 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ model: โ
|
49 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ activation_hidden_dim: 384 โ
|
50 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ attention_n_heads: 12 โ
|
51 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ attention_n_kv_heads: 4 โ
|
52 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ batch_size: 1024 โ
|
53 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ d_model: 96 โ
|
54 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ max_seq_len: 2048 โ
|
55 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ model_type: pico_decoder โ
|
56 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ n_layers: 12 โ
|
57 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ norm_eps: 1.0e-06 โ
|
58 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ position_emb_theta: 10000.0 โ
|
59 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ vocab_size: 50304 โ
|
60 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ monitoring: โ
|
61 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ logging: โ
|
62 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ log_every_n_steps: 25 โ
|
63 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ log_level: INFO โ
|
64 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ save_to_wandb: false โ
|
65 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ wandb: โ
|
66 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ entity: boymyc โ
|
67 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ project: pico-decoder-tiny โ
|
68 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ training: โ
|
69 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ fabric: โ
|
70 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ accelerator: cuda โ
|
71 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ num_devices: 1 โ
|
72 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ num_nodes: 1 โ
|
73 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ precision: bf16-mixed โ
|
74 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ max_steps: 20000 โ
|
75 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ optimization: โ
|
76 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ gradient_accumulation_steps: 4 โ
|
77 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ lr: 5.0e-05 โ
|
78 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ lr_scheduler: cosine โ
|
79 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ lr_warmup_steps: 8000 โ
|
80 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ optimizer: adamw โ
|
81 |
+
2025-08-29 22:50:26 - pico-train - INFO - โ โ
|
82 |
+
2025-08-29 22:50:26 - pico-train - INFO - โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ
|
83 |
+
2025-08-29 22:50:26 - pico-train - INFO - ==================================================
|
84 |
+
2025-08-29 22:50:26 - pico-train - INFO - ⛭ Runtime Summary:
|
85 |
+
2025-08-29 22:50:26 - pico-train - INFO - ==================================================
|
86 |
+
2025-08-29 22:50:26 - pico-train - INFO - Starting from step: 20000
|
87 |
+
2025-08-29 22:50:26 - pico-train - INFO - Model Setup:
|
88 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ Total Parameters: 11,282,784
|
89 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ Trainable Parameters: 11,282,784
|
90 |
+
2025-08-29 22:50:26 - pico-train - INFO - Distributed Setup:
|
91 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ Number of Devices: 1
|
92 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ Device Type: NVIDIA GeForce RTX 5090
|
93 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ Available Memory: 33.68 GB
|
94 |
+
2025-08-29 22:50:26 - pico-train - INFO - Software Setup:
|
95 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ Python Version: 3.10.12
|
96 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ PyTorch Version: 2.8.0+cu128
|
97 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ CUDA Version: 12.8
|
98 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ Operating System: Linux 6.8.0-63-generic
|
99 |
+
2025-08-29 22:50:26 - pico-train - INFO - Batch Size Configuration:
|
100 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ Global Batch Size: 4
|
101 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ Per Device Batch Size: 1
|
102 |
+
2025-08-29 22:50:26 - pico-train - INFO - โโ Gradient Accumulation Steps: 4
|
103 |
+
2025-08-29 22:50:26 - pico-train - INFO - ==================================================
|
104 |
+
2025-08-29 22:50:27 - pico-train - INFO - Step 20000 -- 🔄 Training Metrics
|
105 |
+
2025-08-29 22:50:27 - pico-train - INFO - ├── Loss: 6.5103
|
106 |
+
2025-08-29 22:50:27 - pico-train - INFO - ├── Learning Rate: 5.00e-06
|
107 |
+
2025-08-29 22:50:27 - pico-train - INFO - └── Inf/NaN count: 0
|
108 |
+
2025-08-29 22:50:27 - pico-train - INFO - Step 20000 -- 📈 Saving Learning Dynamics
|
109 |
+
2025-08-29 22:50:43 - pico-train - INFO - Step 20025 -- 🔄 Training Metrics
|
110 |
+
2025-08-29 22:50:43 - pico-train - INFO - ├── Loss: 6.4274
|
111 |
+
2025-08-29 22:50:43 - pico-train - INFO - ├── Learning Rate: 3.45e-05
|
112 |
+
2025-08-29 22:50:43 - pico-train - INFO - └── Inf/NaN count: 0
|
113 |
+
2025-08-29 22:50:55 - pico-train - INFO - Step 20050 -- 🔄 Training Metrics
|
114 |
+
2025-08-29 22:50:55 - pico-train - INFO - ├── Loss: 6.3770
|
115 |
+
2025-08-29 22:50:55 - pico-train - INFO - ├── Learning Rate: 3.45e-05
|
116 |
+
2025-08-29 22:50:55 - pico-train - INFO - └── Inf/NaN count: 0
|
117 |
+
2025-08-29 22:51:08 - pico-train - INFO - Step 20075 -- 🔄 Training Metrics
|
118 |
+
2025-08-29 22:51:08 - pico-train - INFO - ├── Loss: 6.2797
|
119 |
+
2025-08-29 22:51:08 - pico-train - INFO - ├── Learning Rate: 3.44e-05
|
120 |
+
2025-08-29 22:51:08 - pico-train - INFO - └── Inf/NaN count: 0
|
121 |
+
2025-08-29 22:51:21 - pico-train - INFO - Step 20100 -- 🔄 Training Metrics
|
122 |
+
2025-08-29 22:51:21 - pico-train - INFO - ├── Loss: 6.3924
|
123 |
+
2025-08-29 22:51:21 - pico-train - INFO - ├── Learning Rate: 3.43e-05
|
124 |
+
2025-08-29 22:51:21 - pico-train - INFO - └── Inf/NaN count: 0
|
125 |
+
2025-08-29 22:51:34 - pico-train - INFO - Step 20125 -- 🔄 Training Metrics
|
126 |
+
2025-08-29 22:51:34 - pico-train - INFO - ├── Loss: 6.4442
|
127 |
+
2025-08-29 22:51:34 - pico-train - INFO - ├── Learning Rate: 3.43e-05
|
128 |
+
2025-08-29 22:51:34 - pico-train - INFO - └── Inf/NaN count: 0
|