DylanJHJ committed on
Commit
e218fa7
·
1 Parent(s): 4201aa8

update model ablation for sampling

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +44 -0
  2. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/config.json +45 -0
  3. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/model.safetensors +3 -0
  4. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt +3 -0
  5. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth +3 -0
  6. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth +3 -0
  7. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth +3 -0
  8. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth +3 -0
  9. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/scheduler.pt +3 -0
  10. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json +0 -0
  11. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/training_args.bin +3 -0
  12. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/config.json +45 -0
  13. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/model.safetensors +3 -0
  14. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/optimizer.pt +3 -0
  15. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_0.pth +3 -0
  16. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_1.pth +3 -0
  17. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_2.pth +3 -0
  18. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_3.pth +3 -0
  19. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/scheduler.pt +3 -0
  20. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/trainer_state.json +2109 -0
  21. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/training_args.bin +3 -0
  22. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/config.json +45 -0
  23. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/model.safetensors +3 -0
  24. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt +3 -0
  25. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth +3 -0
  26. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth +3 -0
  27. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth +3 -0
  28. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth +3 -0
  29. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/scheduler.pt +3 -0
  30. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json +4184 -0
  31. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/training_args.bin +3 -0
  32. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/config.json +45 -0
  33. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/model.safetensors +3 -0
  34. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/optimizer.pt +3 -0
  35. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_0.pth +3 -0
  36. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_1.pth +3 -0
  37. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_2.pth +3 -0
  38. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_3.pth +3 -0
  39. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/scheduler.pt +3 -0
  40. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/trainer_state.json +0 -0
  41. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/training_args.bin +3 -0
  42. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/config.json +45 -0
  43. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/model.safetensors +3 -0
  44. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/optimizer.pt +3 -0
  45. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_0.pth +3 -0
  46. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_1.pth +3 -0
  47. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_2.pth +3 -0
  48. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_3.pth +3 -0
  49. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/runs/Nov25_13-58-36_nid005118/events.out.tfevents.1764072154.nid005118.9241.0 +3 -0
  50. modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/scheduler.pt +3 -0
.gitattributes CHANGED
@@ -63,3 +63,47 @@ modernbert-crux-researchy-pos_high.neg_zero.b64_n512.1e-4.512/checkpoint-5000/mo
63
  modernbert-crux-researchy-pos_half.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
64
  modernbert-crux-researchy-pos_low.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
65
  modernbert-crux-researchy-pos_high.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  modernbert-crux-researchy-pos_half.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
64
  modernbert-crux-researchy-pos_low.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
65
  modernbert-crux-researchy-pos_high.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
66
+ modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
67
+ modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/optimizer.pt filter=lfs diff=lfs merge=lfs -text
68
+ modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/model.safetensors filter=lfs diff=lfs merge=lfs -text
69
+ modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/model.safetensors filter=lfs diff=lfs merge=lfs -text
70
+ modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/optimizer.pt filter=lfs diff=lfs merge=lfs -text
71
+ modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/model.safetensors filter=lfs diff=lfs merge=lfs -text
72
+ modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
73
+ modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/model.safetensors filter=lfs diff=lfs merge=lfs -text
74
+ modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
75
+ modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/model.safetensors filter=lfs diff=lfs merge=lfs -text
76
+ modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
77
+ modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-7500/optimizer.pt filter=lfs diff=lfs merge=lfs -text
78
+ modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-7500/model.safetensors filter=lfs diff=lfs merge=lfs -text
79
+ modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/model.safetensors filter=lfs diff=lfs merge=lfs -text
80
+ modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-2500/optimizer.pt filter=lfs diff=lfs merge=lfs -text
81
+ modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-2500/model.safetensors filter=lfs diff=lfs merge=lfs -text
82
+ modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
83
+ modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-10000/model.safetensors filter=lfs diff=lfs merge=lfs -text
84
+ modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
85
+ modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-5000/model.safetensors filter=lfs diff=lfs merge=lfs -text
86
+ .git/lfs/objects/ce/70/ce70baabd51a179ecf81d62946475b863fa7b485ef3208ae2c04b60c275a2a96 filter=lfs diff=lfs merge=lfs -text
87
+ .git/lfs/objects/a4/b6/a4b6b75bc77beffc95b5d6c546e916043aee3af38d55035c9f51ebbba73c6b33 filter=lfs diff=lfs merge=lfs -text
88
+ .git/lfs/objects/c7/f2/c7f26a5b35e56ecb6bb72212ce3a7da9d14d66de5114f67202346c60edd18426 filter=lfs diff=lfs merge=lfs -text
89
+ .git/lfs/objects/26/fa/26faaeca5a59a77444ef3f16961df2c4b1b3c298a93cb53f21213fbab0ade4be filter=lfs diff=lfs merge=lfs -text
90
+ .git/lfs/objects/26/dd/26ddea10d1b592b763ee59b05f1558c3cb2f0493c1b214f3c80feb714b66c3b1 filter=lfs diff=lfs merge=lfs -text
91
+ .git/lfs/objects/83/6c/836c84924653789c9c3b731d22335a04af06d4df529543f7e65b0db31889ea74 filter=lfs diff=lfs merge=lfs -text
92
+ .git/lfs/objects/88/92/88926b3be5727f9d0cc432ab2c00dfc3cab66da501336e8c8a7e08188cc0de25 filter=lfs diff=lfs merge=lfs -text
93
+ .git/lfs/objects/1b/d3/1bd3c6439961fc549637f6cc43348eb9fbd2080ab2266164dcd783f7c3699e7a filter=lfs diff=lfs merge=lfs -text
94
+ .git/lfs/objects/22/3b/223b60387aa7ede0f506e51bf9e672bf2b2acae07c62b17cbf746a0246ac1da8 filter=lfs diff=lfs merge=lfs -text
95
+ .git/lfs/objects/1a/ff/1aff66ee140ada517137e3deb00c2322cd5f3c5c26e947f38a24227e394683ac filter=lfs diff=lfs merge=lfs -text
96
+ .git/lfs/objects/8b/3f/8b3f482e56a92f0760e27f700de46d09e3a07b0a89d3ef90808d0d6b22512827 filter=lfs diff=lfs merge=lfs -text
97
+ .git/lfs/objects/78/03/780374ad1143b7bbe5b73f25a7260aa5063a103a1169b688b4285124dcb04dcc filter=lfs diff=lfs merge=lfs -text
98
+ .git/lfs/objects/6d/76/6d761d2f71db6d9137a94a442e6e58d9d7a63ac279a004e02afd5fe416791c0f filter=lfs diff=lfs merge=lfs -text
99
+ .git/lfs/objects/5b/7d/5b7de0c3bda08d9fbc23706080095b6e9f2e2204ff73583266d980bde7676f45 filter=lfs diff=lfs merge=lfs -text
100
+ .git/lfs/objects/c9/ce/c9cef30b887e6bf1661270424c3865351158cce8f23f20d0314c9463455010f0 filter=lfs diff=lfs merge=lfs -text
101
+ .git/lfs/objects/d4/c8/d4c81558929d61f7e09725cdd0299c7c9a60354e8e937058fa212bd0fed0d5dc filter=lfs diff=lfs merge=lfs -text
102
+ .git/lfs/objects/2d/ca/2dca16734197d014f2cfd5f2b53a60d6c915ae69dcfc3813fbe9819c346e5316 filter=lfs diff=lfs merge=lfs -text
103
+ .git/lfs/objects/06/8a/068adc19df70de3d48c90e1a4bfee35d3260dcecdfab93d036098987af28c762 filter=lfs diff=lfs merge=lfs -text
104
+ .git/lfs/objects/5e/7c/5e7c5f3f7ec54eb7a6f288a9c3b9a239b5f1a88679a0d4ced6160c6bcd8b5c1f filter=lfs diff=lfs merge=lfs -text
105
+ .git/lfs/objects/56/00/560001d7760823f163c427fee75d58d3d89e41c166b9cbba91ea27edc32f94e1 filter=lfs diff=lfs merge=lfs -text
106
+ .git/lfs/objects/f0/4c/f04c2e8ee9c6967f1709716e47b4464861ad1aeb50641b3b1dfbe87cb8894d58 filter=lfs diff=lfs merge=lfs -text
107
+ .git/lfs/objects/f0/69/f069e96d36df3a41005ecc6796d5ed8429bdffa25d9e9d133e6191cbf90b3de7 filter=lfs diff=lfs merge=lfs -text
108
+ .git/lfs/objects/65/f2/65f21ce22f521a5d6f2e5456473256b7984e5c46122adc14ec92997d9ca30764 filter=lfs diff=lfs merge=lfs -text
109
+ .git/lfs/objects/fd/d7/fdd7dd2e666a3126c4ed005d62b97fc63ca51fad19f7d480fb83d6eac7bfbbd0 filter=lfs diff=lfs merge=lfs -text
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ModernBertModel"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 50281,
8
+ "classifier_activation": "gelu",
9
+ "classifier_bias": false,
10
+ "classifier_dropout": 0.0,
11
+ "classifier_pooling": "mean",
12
+ "cls_token_id": 50281,
13
+ "decoder_bias": true,
14
+ "deterministic_flash_attn": false,
15
+ "dtype": "bfloat16",
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "repad_logits_with_grad": false,
40
+ "sep_token_id": 50282,
41
+ "sparse_pred_ignore_index": -100,
42
+ "sparse_prediction": false,
43
+ "transformers_version": "4.57.1",
44
+ "vocab_size": 50368
45
+ }
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa648019534a54da16e5df11fb28257398ac4ee886de2d2ef90e587b14a698f7
3
+ size 298041696
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a47b56f6d386053d17cfa1c908b16dcebcec2fa8dbf6ea679e0add277be30b3
3
+ size 596170443
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c235c10397ca3fb3b82475883c48d3bb786206feaee53c2199c913179faf1fb
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:937bfac24cd2fe886a72cb180e9d726f8629acaf1e31b2beab1f7a03381ca0ca
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee0687693332dd9f28a675c2a9f27590ae650095d80dac61354fce4437e7f9de
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffb4dab4ba8c60d5f5c48a1048c1ecc4e949aff462fd8340d7ad1a380fc12fdd
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d02be6d8bda4ea9c67040ed89f878acdc986bd4df3fbc60440a9d3eacca02d63
3
+ size 1465
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61b297d5e64c898db22dbb0b2c7feb17b604dfbcc3bfebf55e9e7ecbf9c3794c
3
+ size 6161
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ModernBertModel"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 50281,
8
+ "classifier_activation": "gelu",
9
+ "classifier_bias": false,
10
+ "classifier_dropout": 0.0,
11
+ "classifier_pooling": "mean",
12
+ "cls_token_id": 50281,
13
+ "decoder_bias": true,
14
+ "deterministic_flash_attn": false,
15
+ "dtype": "bfloat16",
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "repad_logits_with_grad": false,
40
+ "sep_token_id": 50282,
41
+ "sparse_pred_ignore_index": -100,
42
+ "sparse_prediction": false,
43
+ "transformers_version": "4.57.1",
44
+ "vocab_size": 50368
45
+ }
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e030bc5d9a25e135113d1a5b5746b18f93841dc8941845cf34057ac91120ef2f
3
+ size 298041696
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:560363724cc297d305f7f508a44b62eb6e5b3c38cc93a253e113fc093e5591d2
3
+ size 596170443
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edd5aaa88c9c1fc8abf11cf4397d5571cd01f3d7b0f19ae2e2d129014be1fa8a
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bf8a2a24c0dde4941747f29f745d706701a3d3b8edb14d342e599b750fa7e64
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bbd2a817215a6f90c3632eb4c1cf3c7a57ed52c41b60ddeaaeaa878bfb142a5
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:019accfa5df5627ce0019fe3ba9da9a9010bf5c682c7b0defe46d07366d9649b
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a752d3f24f72817376cd37ffda577fd802575961fb476ede3db67c3cc89113bf
3
+ size 1465
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/trainer_state.json ADDED
@@ -0,0 +1,2109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.1133250311332503,
6
+ "eval_steps": 100,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.012453300124533,
14
+ "grad_norm": 15.75,
15
+ "learning_rate": 9e-07,
16
+ "loss": 5.5753,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.024906600249066,
21
+ "grad_norm": 25.25,
22
+ "learning_rate": 1.9e-06,
23
+ "loss": 5.5776,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.037359900373599,
28
+ "grad_norm": 16.25,
29
+ "learning_rate": 2.9e-06,
30
+ "loss": 5.5572,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.049813200498132,
35
+ "grad_norm": 23.5,
36
+ "learning_rate": 3.9e-06,
37
+ "loss": 5.5201,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.062266500622665005,
42
+ "grad_norm": 18.0,
43
+ "learning_rate": 4.9000000000000005e-06,
44
+ "loss": 5.6297,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.074719800747198,
49
+ "grad_norm": 19.125,
50
+ "learning_rate": 5.9e-06,
51
+ "loss": 5.5889,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.08717310087173101,
56
+ "grad_norm": 23.125,
57
+ "learning_rate": 6.900000000000001e-06,
58
+ "loss": 5.4949,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.099626400996264,
63
+ "grad_norm": 20.25,
64
+ "learning_rate": 7.9e-06,
65
+ "loss": 5.552,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.11207970112079702,
70
+ "grad_norm": 17.0,
71
+ "learning_rate": 8.9e-06,
72
+ "loss": 5.4765,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.12453300124533001,
77
+ "grad_norm": 20.125,
78
+ "learning_rate": 9.900000000000002e-06,
79
+ "loss": 5.4519,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.12453300124533001,
84
+ "eval/acc": 2.3255813121795654,
85
+ "step": 100
86
+ },
87
+ {
88
+ "epoch": 0.12453300124533001,
89
+ "eval_loss": 4.970202445983887,
90
+ "eval_runtime": 2.822,
91
+ "eval_samples_per_second": 15.237,
92
+ "eval_steps_per_second": 0.354,
93
+ "step": 100
94
+ },
95
+ {
96
+ "epoch": 0.136986301369863,
97
+ "grad_norm": 22.125,
98
+ "learning_rate": 1.09e-05,
99
+ "loss": 5.3401,
100
+ "step": 110
101
+ },
102
+ {
103
+ "epoch": 0.149439601494396,
104
+ "grad_norm": 16.25,
105
+ "learning_rate": 1.19e-05,
106
+ "loss": 5.3088,
107
+ "step": 120
108
+ },
109
+ {
110
+ "epoch": 0.16189290161892902,
111
+ "grad_norm": 18.75,
112
+ "learning_rate": 1.29e-05,
113
+ "loss": 5.1442,
114
+ "step": 130
115
+ },
116
+ {
117
+ "epoch": 0.17434620174346202,
118
+ "grad_norm": 19.5,
119
+ "learning_rate": 1.3900000000000002e-05,
120
+ "loss": 5.0218,
121
+ "step": 140
122
+ },
123
+ {
124
+ "epoch": 0.18679950186799502,
125
+ "grad_norm": 25.75,
126
+ "learning_rate": 1.49e-05,
127
+ "loss": 4.8711,
128
+ "step": 150
129
+ },
130
+ {
131
+ "epoch": 0.199252801992528,
132
+ "grad_norm": 25.625,
133
+ "learning_rate": 1.59e-05,
134
+ "loss": 4.6046,
135
+ "step": 160
136
+ },
137
+ {
138
+ "epoch": 0.21170610211706103,
139
+ "grad_norm": 28.25,
140
+ "learning_rate": 1.69e-05,
141
+ "loss": 4.2891,
142
+ "step": 170
143
+ },
144
+ {
145
+ "epoch": 0.22415940224159403,
146
+ "grad_norm": 25.25,
147
+ "learning_rate": 1.79e-05,
148
+ "loss": 3.8055,
149
+ "step": 180
150
+ },
151
+ {
152
+ "epoch": 0.23661270236612703,
153
+ "grad_norm": 28.0,
154
+ "learning_rate": 1.8900000000000002e-05,
155
+ "loss": 3.4139,
156
+ "step": 190
157
+ },
158
+ {
159
+ "epoch": 0.24906600249066002,
160
+ "grad_norm": 29.5,
161
+ "learning_rate": 1.9900000000000003e-05,
162
+ "loss": 2.974,
163
+ "step": 200
164
+ },
165
+ {
166
+ "epoch": 0.24906600249066002,
167
+ "eval/acc": 11.627906799316406,
168
+ "step": 200
169
+ },
170
+ {
171
+ "epoch": 0.24906600249066002,
172
+ "eval_loss": 3.7072134017944336,
173
+ "eval_runtime": 0.2742,
174
+ "eval_samples_per_second": 156.807,
175
+ "eval_steps_per_second": 3.647,
176
+ "step": 200
177
+ },
178
+ {
179
+ "epoch": 0.261519302615193,
180
+ "grad_norm": 30.5,
181
+ "learning_rate": 2.09e-05,
182
+ "loss": 2.8723,
183
+ "step": 210
184
+ },
185
+ {
186
+ "epoch": 0.273972602739726,
187
+ "grad_norm": 19.625,
188
+ "learning_rate": 2.19e-05,
189
+ "loss": 2.6908,
190
+ "step": 220
191
+ },
192
+ {
193
+ "epoch": 0.286425902864259,
194
+ "grad_norm": 18.25,
195
+ "learning_rate": 2.29e-05,
196
+ "loss": 2.4715,
197
+ "step": 230
198
+ },
199
+ {
200
+ "epoch": 0.298879202988792,
201
+ "grad_norm": 16.75,
202
+ "learning_rate": 2.39e-05,
203
+ "loss": 2.4336,
204
+ "step": 240
205
+ },
206
+ {
207
+ "epoch": 0.31133250311332505,
208
+ "grad_norm": 16.875,
209
+ "learning_rate": 2.4900000000000002e-05,
210
+ "loss": 2.3797,
211
+ "step": 250
212
+ },
213
+ {
214
+ "epoch": 0.32378580323785805,
215
+ "grad_norm": 18.375,
216
+ "learning_rate": 2.5900000000000003e-05,
217
+ "loss": 2.2765,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 0.33623910336239105,
222
+ "grad_norm": 13.625,
223
+ "learning_rate": 2.6900000000000003e-05,
224
+ "loss": 2.1124,
225
+ "step": 270
226
+ },
227
+ {
228
+ "epoch": 0.34869240348692404,
229
+ "grad_norm": 19.5,
230
+ "learning_rate": 2.7900000000000004e-05,
231
+ "loss": 2.0748,
232
+ "step": 280
233
+ },
234
+ {
235
+ "epoch": 0.36114570361145704,
236
+ "grad_norm": 18.0,
237
+ "learning_rate": 2.8899999999999998e-05,
238
+ "loss": 2.1575,
239
+ "step": 290
240
+ },
241
+ {
242
+ "epoch": 0.37359900373599003,
243
+ "grad_norm": 34.0,
244
+ "learning_rate": 2.9900000000000002e-05,
245
+ "loss": 2.1195,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 0.37359900373599003,
250
+ "eval/acc": 23.255813598632812,
251
+ "step": 300
252
+ },
253
+ {
254
+ "epoch": 0.37359900373599003,
255
+ "eval_loss": 3.1418063640594482,
256
+ "eval_runtime": 1.1652,
257
+ "eval_samples_per_second": 36.903,
258
+ "eval_steps_per_second": 0.858,
259
+ "step": 300
260
+ },
261
+ {
262
+ "epoch": 0.386052303860523,
263
+ "grad_norm": 19.125,
264
+ "learning_rate": 3.09e-05,
265
+ "loss": 2.1052,
266
+ "step": 310
267
+ },
268
+ {
269
+ "epoch": 0.398505603985056,
270
+ "grad_norm": 20.375,
271
+ "learning_rate": 3.19e-05,
272
+ "loss": 1.8924,
273
+ "step": 320
274
+ },
275
+ {
276
+ "epoch": 0.410958904109589,
277
+ "grad_norm": 17.125,
278
+ "learning_rate": 3.29e-05,
279
+ "loss": 2.025,
280
+ "step": 330
281
+ },
282
+ {
283
+ "epoch": 0.42341220423412207,
284
+ "grad_norm": 28.0,
285
+ "learning_rate": 3.3900000000000004e-05,
286
+ "loss": 1.8914,
287
+ "step": 340
288
+ },
289
+ {
290
+ "epoch": 0.43586550435865506,
291
+ "grad_norm": 22.125,
292
+ "learning_rate": 3.49e-05,
293
+ "loss": 1.8864,
294
+ "step": 350
295
+ },
296
+ {
297
+ "epoch": 0.44831880448318806,
298
+ "grad_norm": 34.0,
299
+ "learning_rate": 3.59e-05,
300
+ "loss": 1.8447,
301
+ "step": 360
302
+ },
303
+ {
304
+ "epoch": 0.46077210460772106,
305
+ "grad_norm": 15.4375,
306
+ "learning_rate": 3.69e-05,
307
+ "loss": 1.7981,
308
+ "step": 370
309
+ },
310
+ {
311
+ "epoch": 0.47322540473225405,
312
+ "grad_norm": 39.25,
313
+ "learning_rate": 3.79e-05,
314
+ "loss": 1.6967,
315
+ "step": 380
316
+ },
317
+ {
318
+ "epoch": 0.48567870485678705,
319
+ "grad_norm": 35.25,
320
+ "learning_rate": 3.8900000000000004e-05,
321
+ "loss": 1.7919,
322
+ "step": 390
323
+ },
324
+ {
325
+ "epoch": 0.49813200498132004,
326
+ "grad_norm": 19.875,
327
+ "learning_rate": 3.99e-05,
328
+ "loss": 1.6083,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 0.49813200498132004,
333
+ "eval/acc": 27.9069766998291,
334
+ "step": 400
335
+ },
336
+ {
337
+ "epoch": 0.49813200498132004,
338
+ "eval_loss": 2.988025665283203,
339
+ "eval_runtime": 0.2197,
340
+ "eval_samples_per_second": 195.684,
341
+ "eval_steps_per_second": 4.551,
342
+ "step": 400
343
+ },
344
+ {
345
+ "epoch": 0.5105853051058531,
346
+ "grad_norm": 14.1875,
347
+ "learning_rate": 4.09e-05,
348
+ "loss": 1.7039,
349
+ "step": 410
350
+ },
351
+ {
352
+ "epoch": 0.523038605230386,
353
+ "grad_norm": 33.25,
354
+ "learning_rate": 4.19e-05,
355
+ "loss": 1.7057,
356
+ "step": 420
357
+ },
358
+ {
359
+ "epoch": 0.5354919053549191,
360
+ "grad_norm": 15.5,
361
+ "learning_rate": 4.29e-05,
362
+ "loss": 1.6425,
363
+ "step": 430
364
+ },
365
+ {
366
+ "epoch": 0.547945205479452,
367
+ "grad_norm": 29.625,
368
+ "learning_rate": 4.39e-05,
369
+ "loss": 1.4995,
370
+ "step": 440
371
+ },
372
+ {
373
+ "epoch": 0.5603985056039851,
374
+ "grad_norm": 15.0625,
375
+ "learning_rate": 4.49e-05,
376
+ "loss": 1.6621,
377
+ "step": 450
378
+ },
379
+ {
380
+ "epoch": 0.572851805728518,
381
+ "grad_norm": 22.25,
382
+ "learning_rate": 4.5900000000000004e-05,
383
+ "loss": 1.5684,
384
+ "step": 460
385
+ },
386
+ {
387
+ "epoch": 0.5853051058530511,
388
+ "grad_norm": 17.25,
389
+ "learning_rate": 4.69e-05,
390
+ "loss": 1.5414,
391
+ "step": 470
392
+ },
393
+ {
394
+ "epoch": 0.597758405977584,
395
+ "grad_norm": 19.25,
396
+ "learning_rate": 4.79e-05,
397
+ "loss": 1.5445,
398
+ "step": 480
399
+ },
400
+ {
401
+ "epoch": 0.6102117061021171,
402
+ "grad_norm": 205.0,
403
+ "learning_rate": 4.89e-05,
404
+ "loss": 1.4726,
405
+ "step": 490
406
+ },
407
+ {
408
+ "epoch": 0.6226650062266501,
409
+ "grad_norm": 13.375,
410
+ "learning_rate": 4.99e-05,
411
+ "loss": 1.3783,
412
+ "step": 500
413
+ },
414
+ {
415
+ "epoch": 0.6226650062266501,
416
+ "eval/acc": 30.23255729675293,
417
+ "step": 500
418
+ },
419
+ {
420
+ "epoch": 0.6226650062266501,
421
+ "eval_loss": 2.777118444442749,
422
+ "eval_runtime": 0.2153,
423
+ "eval_samples_per_second": 199.749,
424
+ "eval_steps_per_second": 4.645,
425
+ "step": 500
426
+ },
427
+ {
428
+ "epoch": 0.635118306351183,
429
+ "grad_norm": 20.875,
430
+ "learning_rate": 5.0900000000000004e-05,
431
+ "loss": 1.4983,
432
+ "step": 510
433
+ },
434
+ {
435
+ "epoch": 0.6475716064757161,
436
+ "grad_norm": 16.625,
437
+ "learning_rate": 5.19e-05,
438
+ "loss": 1.5093,
439
+ "step": 520
440
+ },
441
+ {
442
+ "epoch": 0.660024906600249,
443
+ "grad_norm": 14.125,
444
+ "learning_rate": 5.2900000000000005e-05,
445
+ "loss": 1.4588,
446
+ "step": 530
447
+ },
448
+ {
449
+ "epoch": 0.6724782067247821,
450
+ "grad_norm": 37.0,
451
+ "learning_rate": 5.390000000000001e-05,
452
+ "loss": 1.4346,
453
+ "step": 540
454
+ },
455
+ {
456
+ "epoch": 0.684931506849315,
457
+ "grad_norm": 16.75,
458
+ "learning_rate": 5.4900000000000006e-05,
459
+ "loss": 1.5363,
460
+ "step": 550
461
+ },
462
+ {
463
+ "epoch": 0.6973848069738481,
464
+ "grad_norm": 28.375,
465
+ "learning_rate": 5.590000000000001e-05,
466
+ "loss": 1.4497,
467
+ "step": 560
468
+ },
469
+ {
470
+ "epoch": 0.709838107098381,
471
+ "grad_norm": 15.5625,
472
+ "learning_rate": 5.69e-05,
473
+ "loss": 1.4005,
474
+ "step": 570
475
+ },
476
+ {
477
+ "epoch": 0.7222914072229141,
478
+ "grad_norm": 14.75,
479
+ "learning_rate": 5.79e-05,
480
+ "loss": 1.4588,
481
+ "step": 580
482
+ },
483
+ {
484
+ "epoch": 0.7347447073474471,
485
+ "grad_norm": 18.5,
486
+ "learning_rate": 5.89e-05,
487
+ "loss": 1.3489,
488
+ "step": 590
489
+ },
490
+ {
491
+ "epoch": 0.7471980074719801,
492
+ "grad_norm": 12.125,
493
+ "learning_rate": 5.99e-05,
494
+ "loss": 1.3295,
495
+ "step": 600
496
+ },
497
+ {
498
+ "epoch": 0.7471980074719801,
499
+ "eval/acc": 39.53488540649414,
500
+ "step": 600
501
+ },
502
+ {
503
+ "epoch": 0.7471980074719801,
504
+ "eval_loss": 2.6652462482452393,
505
+ "eval_runtime": 0.2211,
506
+ "eval_samples_per_second": 194.477,
507
+ "eval_steps_per_second": 4.523,
508
+ "step": 600
509
+ },
510
+ {
511
+ "epoch": 0.7596513075965131,
512
+ "grad_norm": 12.9375,
513
+ "learning_rate": 6.09e-05,
514
+ "loss": 1.3717,
515
+ "step": 610
516
+ },
517
+ {
518
+ "epoch": 0.772104607721046,
519
+ "grad_norm": 21.5,
520
+ "learning_rate": 6.19e-05,
521
+ "loss": 1.425,
522
+ "step": 620
523
+ },
524
+ {
525
+ "epoch": 0.7845579078455791,
526
+ "grad_norm": 13.6875,
527
+ "learning_rate": 6.29e-05,
528
+ "loss": 1.3017,
529
+ "step": 630
530
+ },
531
+ {
532
+ "epoch": 0.797011207970112,
533
+ "grad_norm": 12.8125,
534
+ "learning_rate": 6.390000000000001e-05,
535
+ "loss": 1.3533,
536
+ "step": 640
537
+ },
538
+ {
539
+ "epoch": 0.8094645080946451,
540
+ "grad_norm": 13.1875,
541
+ "learning_rate": 6.49e-05,
542
+ "loss": 1.271,
543
+ "step": 650
544
+ },
545
+ {
546
+ "epoch": 0.821917808219178,
547
+ "grad_norm": 15.125,
548
+ "learning_rate": 6.59e-05,
549
+ "loss": 1.3734,
550
+ "step": 660
551
+ },
552
+ {
553
+ "epoch": 0.8343711083437111,
554
+ "grad_norm": 16.125,
555
+ "learning_rate": 6.690000000000001e-05,
556
+ "loss": 1.3092,
557
+ "step": 670
558
+ },
559
+ {
560
+ "epoch": 0.8468244084682441,
561
+ "grad_norm": 17.75,
562
+ "learning_rate": 6.790000000000001e-05,
563
+ "loss": 1.1803,
564
+ "step": 680
565
+ },
566
+ {
567
+ "epoch": 0.8592777085927771,
568
+ "grad_norm": 13.875,
569
+ "learning_rate": 6.89e-05,
570
+ "loss": 1.3383,
571
+ "step": 690
572
+ },
573
+ {
574
+ "epoch": 0.8717310087173101,
575
+ "grad_norm": 11.25,
576
+ "learning_rate": 6.99e-05,
577
+ "loss": 1.3024,
578
+ "step": 700
579
+ },
580
+ {
581
+ "epoch": 0.8717310087173101,
582
+ "eval/acc": 34.88372039794922,
583
+ "step": 700
584
+ },
585
+ {
586
+ "epoch": 0.8717310087173101,
587
+ "eval_loss": 2.7215068340301514,
588
+ "eval_runtime": 0.3836,
589
+ "eval_samples_per_second": 112.097,
590
+ "eval_steps_per_second": 2.607,
591
+ "step": 700
592
+ },
593
+ {
594
+ "epoch": 0.8841843088418431,
595
+ "grad_norm": 67.0,
596
+ "learning_rate": 7.09e-05,
597
+ "loss": 1.2095,
598
+ "step": 710
599
+ },
600
+ {
601
+ "epoch": 0.8966376089663761,
602
+ "grad_norm": 9.875,
603
+ "learning_rate": 7.19e-05,
604
+ "loss": 1.2948,
605
+ "step": 720
606
+ },
607
+ {
608
+ "epoch": 0.9090909090909091,
609
+ "grad_norm": 14.1875,
610
+ "learning_rate": 7.29e-05,
611
+ "loss": 1.3225,
612
+ "step": 730
613
+ },
614
+ {
615
+ "epoch": 0.9215442092154421,
616
+ "grad_norm": 13.125,
617
+ "learning_rate": 7.390000000000001e-05,
618
+ "loss": 1.1936,
619
+ "step": 740
620
+ },
621
+ {
622
+ "epoch": 0.933997509339975,
623
+ "grad_norm": 12.875,
624
+ "learning_rate": 7.49e-05,
625
+ "loss": 1.2211,
626
+ "step": 750
627
+ },
628
+ {
629
+ "epoch": 0.9464508094645081,
630
+ "grad_norm": 13.5625,
631
+ "learning_rate": 7.59e-05,
632
+ "loss": 1.2435,
633
+ "step": 760
634
+ },
635
+ {
636
+ "epoch": 0.958904109589041,
637
+ "grad_norm": 19.25,
638
+ "learning_rate": 7.69e-05,
639
+ "loss": 1.1786,
640
+ "step": 770
641
+ },
642
+ {
643
+ "epoch": 0.9713574097135741,
644
+ "grad_norm": 14.375,
645
+ "learning_rate": 7.790000000000001e-05,
646
+ "loss": 1.2784,
647
+ "step": 780
648
+ },
649
+ {
650
+ "epoch": 0.9838107098381071,
651
+ "grad_norm": 10.625,
652
+ "learning_rate": 7.890000000000001e-05,
653
+ "loss": 1.2618,
654
+ "step": 790
655
+ },
656
+ {
657
+ "epoch": 0.9962640099626401,
658
+ "grad_norm": 22.75,
659
+ "learning_rate": 7.99e-05,
660
+ "loss": 1.1601,
661
+ "step": 800
662
+ },
663
+ {
664
+ "epoch": 0.9962640099626401,
665
+ "eval/acc": 41.86046600341797,
666
+ "step": 800
667
+ },
668
+ {
669
+ "epoch": 0.9962640099626401,
670
+ "eval_loss": 2.622220516204834,
671
+ "eval_runtime": 0.2076,
672
+ "eval_samples_per_second": 207.126,
673
+ "eval_steps_per_second": 4.817,
674
+ "step": 800
675
+ },
676
+ {
677
+ "epoch": 1.0087173100871731,
678
+ "grad_norm": 12.75,
679
+ "learning_rate": 8.090000000000001e-05,
680
+ "loss": 1.0972,
681
+ "step": 810
682
+ },
683
+ {
684
+ "epoch": 1.0211706102117062,
685
+ "grad_norm": 9.6875,
686
+ "learning_rate": 8.19e-05,
687
+ "loss": 1.1999,
688
+ "step": 820
689
+ },
690
+ {
691
+ "epoch": 1.033623910336239,
692
+ "grad_norm": 11.625,
693
+ "learning_rate": 8.29e-05,
694
+ "loss": 1.1677,
695
+ "step": 830
696
+ },
697
+ {
698
+ "epoch": 1.046077210460772,
699
+ "grad_norm": 12.8125,
700
+ "learning_rate": 8.39e-05,
701
+ "loss": 1.1505,
702
+ "step": 840
703
+ },
704
+ {
705
+ "epoch": 1.0585305105853051,
706
+ "grad_norm": 11.6875,
707
+ "learning_rate": 8.49e-05,
708
+ "loss": 1.1599,
709
+ "step": 850
710
+ },
711
+ {
712
+ "epoch": 1.0709838107098382,
713
+ "grad_norm": 9.8125,
714
+ "learning_rate": 8.59e-05,
715
+ "loss": 1.1746,
716
+ "step": 860
717
+ },
718
+ {
719
+ "epoch": 1.083437110834371,
720
+ "grad_norm": 11.625,
721
+ "learning_rate": 8.69e-05,
722
+ "loss": 1.047,
723
+ "step": 870
724
+ },
725
+ {
726
+ "epoch": 1.095890410958904,
727
+ "grad_norm": 10.125,
728
+ "learning_rate": 8.790000000000001e-05,
729
+ "loss": 1.107,
730
+ "step": 880
731
+ },
732
+ {
733
+ "epoch": 1.108343711083437,
734
+ "grad_norm": 9.0,
735
+ "learning_rate": 8.89e-05,
736
+ "loss": 1.1105,
737
+ "step": 890
738
+ },
739
+ {
740
+ "epoch": 1.1207970112079702,
741
+ "grad_norm": 13.125,
742
+ "learning_rate": 8.99e-05,
743
+ "loss": 1.1848,
744
+ "step": 900
745
+ },
746
+ {
747
+ "epoch": 1.1207970112079702,
748
+ "eval/acc": 34.88372039794922,
749
+ "step": 900
750
+ },
751
+ {
752
+ "epoch": 1.1207970112079702,
753
+ "eval_loss": 2.8814988136291504,
754
+ "eval_runtime": 1.0687,
755
+ "eval_samples_per_second": 40.237,
756
+ "eval_steps_per_second": 0.936,
757
+ "step": 900
758
+ },
759
+ {
760
+ "epoch": 1.1332503113325032,
761
+ "grad_norm": 13.25,
762
+ "learning_rate": 9.090000000000001e-05,
763
+ "loss": 1.1235,
764
+ "step": 910
765
+ },
766
+ {
767
+ "epoch": 1.145703611457036,
768
+ "grad_norm": 17.625,
769
+ "learning_rate": 9.190000000000001e-05,
770
+ "loss": 1.0304,
771
+ "step": 920
772
+ },
773
+ {
774
+ "epoch": 1.158156911581569,
775
+ "grad_norm": 11.5625,
776
+ "learning_rate": 9.290000000000001e-05,
777
+ "loss": 1.0373,
778
+ "step": 930
779
+ },
780
+ {
781
+ "epoch": 1.1706102117061021,
782
+ "grad_norm": 13.25,
783
+ "learning_rate": 9.39e-05,
784
+ "loss": 1.12,
785
+ "step": 940
786
+ },
787
+ {
788
+ "epoch": 1.1830635118306352,
789
+ "grad_norm": 10.4375,
790
+ "learning_rate": 9.49e-05,
791
+ "loss": 1.0623,
792
+ "step": 950
793
+ },
794
+ {
795
+ "epoch": 1.195516811955168,
796
+ "grad_norm": 14.625,
797
+ "learning_rate": 9.59e-05,
798
+ "loss": 1.0692,
799
+ "step": 960
800
+ },
801
+ {
802
+ "epoch": 1.207970112079701,
803
+ "grad_norm": 9.6875,
804
+ "learning_rate": 9.69e-05,
805
+ "loss": 1.1914,
806
+ "step": 970
807
+ },
808
+ {
809
+ "epoch": 1.2204234122042341,
810
+ "grad_norm": 10.4375,
811
+ "learning_rate": 9.790000000000001e-05,
812
+ "loss": 1.1094,
813
+ "step": 980
814
+ },
815
+ {
816
+ "epoch": 1.2328767123287672,
817
+ "grad_norm": 9.625,
818
+ "learning_rate": 9.89e-05,
819
+ "loss": 1.0557,
820
+ "step": 990
821
+ },
822
+ {
823
+ "epoch": 1.2453300124533002,
824
+ "grad_norm": 15.75,
825
+ "learning_rate": 9.99e-05,
826
+ "loss": 0.9635,
827
+ "step": 1000
828
+ },
829
+ {
830
+ "epoch": 1.2453300124533002,
831
+ "eval/acc": 34.88372039794922,
832
+ "step": 1000
833
+ },
834
+ {
835
+ "epoch": 1.2453300124533002,
836
+ "eval_loss": 2.967315435409546,
837
+ "eval_runtime": 0.2242,
838
+ "eval_samples_per_second": 191.798,
839
+ "eval_steps_per_second": 4.46,
840
+ "step": 1000
841
+ },
842
+ {
843
+ "epoch": 1.257783312577833,
844
+ "grad_norm": 11.8125,
845
+ "learning_rate": 9.99e-05,
846
+ "loss": 1.0067,
847
+ "step": 1010
848
+ },
849
+ {
850
+ "epoch": 1.270236612702366,
851
+ "grad_norm": 11.4375,
852
+ "learning_rate": 9.97888888888889e-05,
853
+ "loss": 1.0609,
854
+ "step": 1020
855
+ },
856
+ {
857
+ "epoch": 1.2826899128268991,
858
+ "grad_norm": 12.875,
859
+ "learning_rate": 9.967777777777779e-05,
860
+ "loss": 1.1566,
861
+ "step": 1030
862
+ },
863
+ {
864
+ "epoch": 1.2951432129514322,
865
+ "grad_norm": 10.625,
866
+ "learning_rate": 9.956666666666667e-05,
867
+ "loss": 1.1045,
868
+ "step": 1040
869
+ },
870
+ {
871
+ "epoch": 1.307596513075965,
872
+ "grad_norm": 10.0625,
873
+ "learning_rate": 9.945555555555555e-05,
874
+ "loss": 1.1421,
875
+ "step": 1050
876
+ },
877
+ {
878
+ "epoch": 1.320049813200498,
879
+ "grad_norm": 11.5625,
880
+ "learning_rate": 9.934444444444445e-05,
881
+ "loss": 1.0453,
882
+ "step": 1060
883
+ },
884
+ {
885
+ "epoch": 1.3325031133250311,
886
+ "grad_norm": 11.3125,
887
+ "learning_rate": 9.923333333333334e-05,
888
+ "loss": 1.0531,
889
+ "step": 1070
890
+ },
891
+ {
892
+ "epoch": 1.3449564134495642,
893
+ "grad_norm": 11.75,
894
+ "learning_rate": 9.912222222222222e-05,
895
+ "loss": 1.0286,
896
+ "step": 1080
897
+ },
898
+ {
899
+ "epoch": 1.3574097135740972,
900
+ "grad_norm": 11.3125,
901
+ "learning_rate": 9.901111111111112e-05,
902
+ "loss": 0.9549,
903
+ "step": 1090
904
+ },
905
+ {
906
+ "epoch": 1.36986301369863,
907
+ "grad_norm": 10.5625,
908
+ "learning_rate": 9.89e-05,
909
+ "loss": 1.006,
910
+ "step": 1100
911
+ },
912
+ {
913
+ "epoch": 1.36986301369863,
914
+ "eval/acc": 34.88372039794922,
915
+ "step": 1100
916
+ },
917
+ {
918
+ "epoch": 1.36986301369863,
919
+ "eval_loss": 2.9681856632232666,
920
+ "eval_runtime": 0.2343,
921
+ "eval_samples_per_second": 183.518,
922
+ "eval_steps_per_second": 4.268,
923
+ "step": 1100
924
+ },
925
+ {
926
+ "epoch": 1.3823163138231631,
927
+ "grad_norm": 13.4375,
928
+ "learning_rate": 9.87888888888889e-05,
929
+ "loss": 1.049,
930
+ "step": 1110
931
+ },
932
+ {
933
+ "epoch": 1.3947696139476962,
934
+ "grad_norm": 13.125,
935
+ "learning_rate": 9.867777777777777e-05,
936
+ "loss": 0.951,
937
+ "step": 1120
938
+ },
939
+ {
940
+ "epoch": 1.4072229140722292,
941
+ "grad_norm": 8.6875,
942
+ "learning_rate": 9.856666666666667e-05,
943
+ "loss": 1.0806,
944
+ "step": 1130
945
+ },
946
+ {
947
+ "epoch": 1.419676214196762,
948
+ "grad_norm": 11.8125,
949
+ "learning_rate": 9.845555555555556e-05,
950
+ "loss": 0.9683,
951
+ "step": 1140
952
+ },
953
+ {
954
+ "epoch": 1.432129514321295,
955
+ "grad_norm": 14.875,
956
+ "learning_rate": 9.834444444444446e-05,
957
+ "loss": 0.977,
958
+ "step": 1150
959
+ },
960
+ {
961
+ "epoch": 1.4445828144458281,
962
+ "grad_norm": 20.125,
963
+ "learning_rate": 9.823333333333333e-05,
964
+ "loss": 0.994,
965
+ "step": 1160
966
+ },
967
+ {
968
+ "epoch": 1.4570361145703612,
969
+ "grad_norm": 11.0,
970
+ "learning_rate": 9.812222222222223e-05,
971
+ "loss": 1.037,
972
+ "step": 1170
973
+ },
974
+ {
975
+ "epoch": 1.4694894146948942,
976
+ "grad_norm": 15.5,
977
+ "learning_rate": 9.801111111111112e-05,
978
+ "loss": 1.1605,
979
+ "step": 1180
980
+ },
981
+ {
982
+ "epoch": 1.481942714819427,
983
+ "grad_norm": 10.9375,
984
+ "learning_rate": 9.790000000000001e-05,
985
+ "loss": 1.0113,
986
+ "step": 1190
987
+ },
988
+ {
989
+ "epoch": 1.4943960149439601,
990
+ "grad_norm": 14.3125,
991
+ "learning_rate": 9.778888888888889e-05,
992
+ "loss": 0.9511,
993
+ "step": 1200
994
+ },
995
+ {
996
+ "epoch": 1.4943960149439601,
997
+ "eval/acc": 37.20930099487305,
998
+ "step": 1200
999
+ },
1000
+ {
1001
+ "epoch": 1.4943960149439601,
1002
+ "eval_loss": 2.701927423477173,
1003
+ "eval_runtime": 0.2099,
1004
+ "eval_samples_per_second": 204.857,
1005
+ "eval_steps_per_second": 4.764,
1006
+ "step": 1200
1007
+ },
1008
+ {
1009
+ "epoch": 1.5068493150684932,
1010
+ "grad_norm": 11.9375,
1011
+ "learning_rate": 9.767777777777778e-05,
1012
+ "loss": 1.0408,
1013
+ "step": 1210
1014
+ },
1015
+ {
1016
+ "epoch": 1.519302615193026,
1017
+ "grad_norm": 7.71875,
1018
+ "learning_rate": 9.756666666666668e-05,
1019
+ "loss": 0.9782,
1020
+ "step": 1220
1021
+ },
1022
+ {
1023
+ "epoch": 1.531755915317559,
1024
+ "grad_norm": 7.5,
1025
+ "learning_rate": 9.745555555555556e-05,
1026
+ "loss": 1.0293,
1027
+ "step": 1230
1028
+ },
1029
+ {
1030
+ "epoch": 1.544209215442092,
1031
+ "grad_norm": 9.6875,
1032
+ "learning_rate": 9.734444444444444e-05,
1033
+ "loss": 0.9718,
1034
+ "step": 1240
1035
+ },
1036
+ {
1037
+ "epoch": 1.5566625155666252,
1038
+ "grad_norm": 11.0,
1039
+ "learning_rate": 9.723333333333334e-05,
1040
+ "loss": 1.0542,
1041
+ "step": 1250
1042
+ },
1043
+ {
1044
+ "epoch": 1.5691158156911582,
1045
+ "grad_norm": 10.5,
1046
+ "learning_rate": 9.712222222222223e-05,
1047
+ "loss": 0.9537,
1048
+ "step": 1260
1049
+ },
1050
+ {
1051
+ "epoch": 1.5815691158156913,
1052
+ "grad_norm": 13.1875,
1053
+ "learning_rate": 9.701111111111111e-05,
1054
+ "loss": 0.9756,
1055
+ "step": 1270
1056
+ },
1057
+ {
1058
+ "epoch": 1.5940224159402243,
1059
+ "grad_norm": 9.9375,
1060
+ "learning_rate": 9.69e-05,
1061
+ "loss": 0.8843,
1062
+ "step": 1280
1063
+ },
1064
+ {
1065
+ "epoch": 1.6064757160647571,
1066
+ "grad_norm": 10.0,
1067
+ "learning_rate": 9.67888888888889e-05,
1068
+ "loss": 0.8808,
1069
+ "step": 1290
1070
+ },
1071
+ {
1072
+ "epoch": 1.6189290161892902,
1073
+ "grad_norm": 14.0,
1074
+ "learning_rate": 9.667777777777778e-05,
1075
+ "loss": 0.9589,
1076
+ "step": 1300
1077
+ },
1078
+ {
1079
+ "epoch": 1.6189290161892902,
1080
+ "eval/acc": 39.53488540649414,
1081
+ "step": 1300
1082
+ },
1083
+ {
1084
+ "epoch": 1.6189290161892902,
1085
+ "eval_loss": 2.7926037311553955,
1086
+ "eval_runtime": 0.2238,
1087
+ "eval_samples_per_second": 192.128,
1088
+ "eval_steps_per_second": 4.468,
1089
+ "step": 1300
1090
+ },
1091
+ {
1092
+ "epoch": 1.631382316313823,
1093
+ "grad_norm": 8.9375,
1094
+ "learning_rate": 9.656666666666668e-05,
1095
+ "loss": 0.9315,
1096
+ "step": 1310
1097
+ },
1098
+ {
1099
+ "epoch": 1.643835616438356,
1100
+ "grad_norm": 10.0625,
1101
+ "learning_rate": 9.645555555555556e-05,
1102
+ "loss": 0.9295,
1103
+ "step": 1320
1104
+ },
1105
+ {
1106
+ "epoch": 1.6562889165628891,
1107
+ "grad_norm": 8.75,
1108
+ "learning_rate": 9.634444444444445e-05,
1109
+ "loss": 0.9255,
1110
+ "step": 1330
1111
+ },
1112
+ {
1113
+ "epoch": 1.6687422166874222,
1114
+ "grad_norm": 11.0625,
1115
+ "learning_rate": 9.623333333333335e-05,
1116
+ "loss": 0.9121,
1117
+ "step": 1340
1118
+ },
1119
+ {
1120
+ "epoch": 1.6811955168119552,
1121
+ "grad_norm": 11.375,
1122
+ "learning_rate": 9.612222222222223e-05,
1123
+ "loss": 0.9232,
1124
+ "step": 1350
1125
+ },
1126
+ {
1127
+ "epoch": 1.6936488169364883,
1128
+ "grad_norm": 11.875,
1129
+ "learning_rate": 9.601111111111112e-05,
1130
+ "loss": 0.8991,
1131
+ "step": 1360
1132
+ },
1133
+ {
1134
+ "epoch": 1.7061021170610213,
1135
+ "grad_norm": 9.0,
1136
+ "learning_rate": 9.59e-05,
1137
+ "loss": 0.9405,
1138
+ "step": 1370
1139
+ },
1140
+ {
1141
+ "epoch": 1.7185554171855542,
1142
+ "grad_norm": 11.875,
1143
+ "learning_rate": 9.57888888888889e-05,
1144
+ "loss": 1.0191,
1145
+ "step": 1380
1146
+ },
1147
+ {
1148
+ "epoch": 1.7310087173100872,
1149
+ "grad_norm": 9.8125,
1150
+ "learning_rate": 9.567777777777778e-05,
1151
+ "loss": 0.9002,
1152
+ "step": 1390
1153
+ },
1154
+ {
1155
+ "epoch": 1.74346201743462,
1156
+ "grad_norm": 12.375,
1157
+ "learning_rate": 9.556666666666667e-05,
1158
+ "loss": 0.9681,
1159
+ "step": 1400
1160
+ },
1161
+ {
1162
+ "epoch": 1.74346201743462,
1163
+ "eval/acc": 39.53488540649414,
1164
+ "step": 1400
1165
+ },
1166
+ {
1167
+ "epoch": 1.74346201743462,
1168
+ "eval_loss": 2.795476198196411,
1169
+ "eval_runtime": 0.2152,
1170
+ "eval_samples_per_second": 199.833,
1171
+ "eval_steps_per_second": 4.647,
1172
+ "step": 1400
1173
+ },
1174
+ {
1175
+ "epoch": 1.755915317559153,
1176
+ "grad_norm": 9.375,
1177
+ "learning_rate": 9.545555555555557e-05,
1178
+ "loss": 1.0222,
1179
+ "step": 1410
1180
+ },
1181
+ {
1182
+ "epoch": 1.7683686176836861,
1183
+ "grad_norm": 9.5625,
1184
+ "learning_rate": 9.534444444444445e-05,
1185
+ "loss": 0.9005,
1186
+ "step": 1420
1187
+ },
1188
+ {
1189
+ "epoch": 1.7808219178082192,
1190
+ "grad_norm": 9.875,
1191
+ "learning_rate": 9.523333333333334e-05,
1192
+ "loss": 0.9616,
1193
+ "step": 1430
1194
+ },
1195
+ {
1196
+ "epoch": 1.7932752179327522,
1197
+ "grad_norm": 28.0,
1198
+ "learning_rate": 9.512222222222222e-05,
1199
+ "loss": 1.0197,
1200
+ "step": 1440
1201
+ },
1202
+ {
1203
+ "epoch": 1.8057285180572853,
1204
+ "grad_norm": 10.75,
1205
+ "learning_rate": 9.501111111111112e-05,
1206
+ "loss": 0.9947,
1207
+ "step": 1450
1208
+ },
1209
+ {
1210
+ "epoch": 1.8181818181818183,
1211
+ "grad_norm": 10.125,
1212
+ "learning_rate": 9.49e-05,
1213
+ "loss": 0.9064,
1214
+ "step": 1460
1215
+ },
1216
+ {
1217
+ "epoch": 1.8306351183063512,
1218
+ "grad_norm": 11.75,
1219
+ "learning_rate": 9.478888888888889e-05,
1220
+ "loss": 0.9425,
1221
+ "step": 1470
1222
+ },
1223
+ {
1224
+ "epoch": 1.8430884184308842,
1225
+ "grad_norm": 10.625,
1226
+ "learning_rate": 9.467777777777779e-05,
1227
+ "loss": 1.0284,
1228
+ "step": 1480
1229
+ },
1230
+ {
1231
+ "epoch": 1.855541718555417,
1232
+ "grad_norm": 10.125,
1233
+ "learning_rate": 9.456666666666667e-05,
1234
+ "loss": 0.9175,
1235
+ "step": 1490
1236
+ },
1237
+ {
1238
+ "epoch": 1.86799501867995,
1239
+ "grad_norm": 8.375,
1240
+ "learning_rate": 9.445555555555557e-05,
1241
+ "loss": 0.8608,
1242
+ "step": 1500
1243
+ },
1244
+ {
1245
+ "epoch": 1.86799501867995,
1246
+ "eval/acc": 39.53488540649414,
1247
+ "step": 1500
1248
+ },
1249
+ {
1250
+ "epoch": 1.86799501867995,
1251
+ "eval_loss": 2.8291714191436768,
1252
+ "eval_runtime": 0.216,
1253
+ "eval_samples_per_second": 199.031,
1254
+ "eval_steps_per_second": 4.629,
1255
+ "step": 1500
1256
+ },
1257
+ {
1258
+ "epoch": 1.8804483188044832,
1259
+ "grad_norm": 9.625,
1260
+ "learning_rate": 9.434444444444444e-05,
1261
+ "loss": 0.9695,
1262
+ "step": 1510
1263
+ },
1264
+ {
1265
+ "epoch": 1.8929016189290162,
1266
+ "grad_norm": 9.8125,
1267
+ "learning_rate": 9.423333333333334e-05,
1268
+ "loss": 0.9924,
1269
+ "step": 1520
1270
+ },
1271
+ {
1272
+ "epoch": 1.9053549190535493,
1273
+ "grad_norm": 10.0,
1274
+ "learning_rate": 9.412222222222222e-05,
1275
+ "loss": 1.0733,
1276
+ "step": 1530
1277
+ },
1278
+ {
1279
+ "epoch": 1.9178082191780823,
1280
+ "grad_norm": 10.25,
1281
+ "learning_rate": 9.401111111111112e-05,
1282
+ "loss": 0.8818,
1283
+ "step": 1540
1284
+ },
1285
+ {
1286
+ "epoch": 1.9302615193026154,
1287
+ "grad_norm": 15.3125,
1288
+ "learning_rate": 9.39e-05,
1289
+ "loss": 0.9053,
1290
+ "step": 1550
1291
+ },
1292
+ {
1293
+ "epoch": 1.9427148194271482,
1294
+ "grad_norm": 8.25,
1295
+ "learning_rate": 9.378888888888889e-05,
1296
+ "loss": 0.8586,
1297
+ "step": 1560
1298
+ },
1299
+ {
1300
+ "epoch": 1.9551681195516812,
1301
+ "grad_norm": 17.5,
1302
+ "learning_rate": 9.367777777777779e-05,
1303
+ "loss": 0.9316,
1304
+ "step": 1570
1305
+ },
1306
+ {
1307
+ "epoch": 1.967621419676214,
1308
+ "grad_norm": 10.875,
1309
+ "learning_rate": 9.356666666666667e-05,
1310
+ "loss": 1.0195,
1311
+ "step": 1580
1312
+ },
1313
+ {
1314
+ "epoch": 1.9800747198007471,
1315
+ "grad_norm": 9.1875,
1316
+ "learning_rate": 9.345555555555556e-05,
1317
+ "loss": 0.8878,
1318
+ "step": 1590
1319
+ },
1320
+ {
1321
+ "epoch": 1.9925280199252802,
1322
+ "grad_norm": 10.3125,
1323
+ "learning_rate": 9.334444444444444e-05,
1324
+ "loss": 0.9765,
1325
+ "step": 1600
1326
+ },
1327
+ {
1328
+ "epoch": 1.9925280199252802,
1329
+ "eval/acc": 37.20930099487305,
1330
+ "step": 1600
1331
+ },
1332
+ {
1333
+ "epoch": 1.9925280199252802,
1334
+ "eval_loss": 2.9084553718566895,
1335
+ "eval_runtime": 0.2099,
1336
+ "eval_samples_per_second": 204.856,
1337
+ "eval_steps_per_second": 4.764,
1338
+ "step": 1600
1339
+ },
1340
+ {
1341
+ "epoch": 2.004981320049813,
1342
+ "grad_norm": 8.6875,
1343
+ "learning_rate": 9.323333333333334e-05,
1344
+ "loss": 0.8596,
1345
+ "step": 1610
1346
+ },
1347
+ {
1348
+ "epoch": 2.0174346201743463,
1349
+ "grad_norm": 12.0625,
1350
+ "learning_rate": 9.312222222222223e-05,
1351
+ "loss": 0.9156,
1352
+ "step": 1620
1353
+ },
1354
+ {
1355
+ "epoch": 2.0298879202988793,
1356
+ "grad_norm": 10.1875,
1357
+ "learning_rate": 9.301111111111111e-05,
1358
+ "loss": 0.8404,
1359
+ "step": 1630
1360
+ },
1361
+ {
1362
+ "epoch": 2.0423412204234124,
1363
+ "grad_norm": 10.125,
1364
+ "learning_rate": 9.290000000000001e-05,
1365
+ "loss": 0.8111,
1366
+ "step": 1640
1367
+ },
1368
+ {
1369
+ "epoch": 2.0547945205479454,
1370
+ "grad_norm": 10.625,
1371
+ "learning_rate": 9.278888888888889e-05,
1372
+ "loss": 0.8124,
1373
+ "step": 1650
1374
+ },
1375
+ {
1376
+ "epoch": 2.067247820672478,
1377
+ "grad_norm": 10.0,
1378
+ "learning_rate": 9.267777777777779e-05,
1379
+ "loss": 0.8124,
1380
+ "step": 1660
1381
+ },
1382
+ {
1383
+ "epoch": 2.079701120797011,
1384
+ "grad_norm": 10.75,
1385
+ "learning_rate": 9.256666666666666e-05,
1386
+ "loss": 0.8384,
1387
+ "step": 1670
1388
+ },
1389
+ {
1390
+ "epoch": 2.092154420921544,
1391
+ "grad_norm": 8.3125,
1392
+ "learning_rate": 9.245555555555556e-05,
1393
+ "loss": 0.8734,
1394
+ "step": 1680
1395
+ },
1396
+ {
1397
+ "epoch": 2.104607721046077,
1398
+ "grad_norm": 8.6875,
1399
+ "learning_rate": 9.234444444444445e-05,
1400
+ "loss": 0.7674,
1401
+ "step": 1690
1402
+ },
1403
+ {
1404
+ "epoch": 2.1170610211706102,
1405
+ "grad_norm": 6.6875,
1406
+ "learning_rate": 9.223333333333334e-05,
1407
+ "loss": 0.8514,
1408
+ "step": 1700
1409
+ },
1410
+ {
1411
+ "epoch": 2.1170610211706102,
1412
+ "eval/acc": 48.83720779418945,
1413
+ "step": 1700
1414
+ },
1415
+ {
1416
+ "epoch": 2.1170610211706102,
1417
+ "eval_loss": 1.9776915311813354,
1418
+ "eval_runtime": 1.2116,
1419
+ "eval_samples_per_second": 35.491,
1420
+ "eval_steps_per_second": 0.825,
1421
+ "step": 1700
1422
+ },
1423
+ {
1424
+ "epoch": 2.1295143212951433,
1425
+ "grad_norm": 11.3125,
1426
+ "learning_rate": 9.212222222222223e-05,
1427
+ "loss": 0.8502,
1428
+ "step": 1710
1429
+ },
1430
+ {
1431
+ "epoch": 2.1419676214196763,
1432
+ "grad_norm": 12.0625,
1433
+ "learning_rate": 9.201111111111111e-05,
1434
+ "loss": 0.9026,
1435
+ "step": 1720
1436
+ },
1437
+ {
1438
+ "epoch": 2.1544209215442094,
1439
+ "grad_norm": 8.3125,
1440
+ "learning_rate": 9.190000000000001e-05,
1441
+ "loss": 0.7893,
1442
+ "step": 1730
1443
+ },
1444
+ {
1445
+ "epoch": 2.166874221668742,
1446
+ "grad_norm": 14.5625,
1447
+ "learning_rate": 9.17888888888889e-05,
1448
+ "loss": 0.7671,
1449
+ "step": 1740
1450
+ },
1451
+ {
1452
+ "epoch": 2.179327521793275,
1453
+ "grad_norm": 11.25,
1454
+ "learning_rate": 9.167777777777778e-05,
1455
+ "loss": 0.7869,
1456
+ "step": 1750
1457
+ },
1458
+ {
1459
+ "epoch": 2.191780821917808,
1460
+ "grad_norm": 8.625,
1461
+ "learning_rate": 9.156666666666667e-05,
1462
+ "loss": 0.8251,
1463
+ "step": 1760
1464
+ },
1465
+ {
1466
+ "epoch": 2.204234122042341,
1467
+ "grad_norm": 7.8125,
1468
+ "learning_rate": 9.145555555555556e-05,
1469
+ "loss": 0.7838,
1470
+ "step": 1770
1471
+ },
1472
+ {
1473
+ "epoch": 2.216687422166874,
1474
+ "grad_norm": 11.6875,
1475
+ "learning_rate": 9.134444444444445e-05,
1476
+ "loss": 0.8348,
1477
+ "step": 1780
1478
+ },
1479
+ {
1480
+ "epoch": 2.2291407222914073,
1481
+ "grad_norm": 9.75,
1482
+ "learning_rate": 9.123333333333333e-05,
1483
+ "loss": 0.8322,
1484
+ "step": 1790
1485
+ },
1486
+ {
1487
+ "epoch": 2.2415940224159403,
1488
+ "grad_norm": 9.25,
1489
+ "learning_rate": 9.112222222222223e-05,
1490
+ "loss": 0.8514,
1491
+ "step": 1800
1492
+ },
1493
+ {
1494
+ "epoch": 2.2415940224159403,
1495
+ "eval/acc": 48.83720779418945,
1496
+ "step": 1800
1497
+ },
1498
+ {
1499
+ "epoch": 2.2415940224159403,
1500
+ "eval_loss": 1.968414306640625,
1501
+ "eval_runtime": 0.2153,
1502
+ "eval_samples_per_second": 199.733,
1503
+ "eval_steps_per_second": 4.645,
1504
+ "step": 1800
1505
+ },
1506
+ {
1507
+ "epoch": 2.2540473225404734,
1508
+ "grad_norm": 12.6875,
1509
+ "learning_rate": 9.101111111111112e-05,
1510
+ "loss": 0.7841,
1511
+ "step": 1810
1512
+ },
1513
+ {
1514
+ "epoch": 2.2665006226650064,
1515
+ "grad_norm": 7.09375,
1516
+ "learning_rate": 9.090000000000001e-05,
1517
+ "loss": 0.7889,
1518
+ "step": 1820
1519
+ },
1520
+ {
1521
+ "epoch": 2.2789539227895395,
1522
+ "grad_norm": 8.1875,
1523
+ "learning_rate": 9.078888888888889e-05,
1524
+ "loss": 0.8088,
1525
+ "step": 1830
1526
+ },
1527
+ {
1528
+ "epoch": 2.291407222914072,
1529
+ "grad_norm": 12.3125,
1530
+ "learning_rate": 9.067777777777778e-05,
1531
+ "loss": 0.8247,
1532
+ "step": 1840
1533
+ },
1534
+ {
1535
+ "epoch": 2.303860523038605,
1536
+ "grad_norm": 7.40625,
1537
+ "learning_rate": 9.056666666666667e-05,
1538
+ "loss": 0.7383,
1539
+ "step": 1850
1540
+ },
1541
+ {
1542
+ "epoch": 2.316313823163138,
1543
+ "grad_norm": 8.5,
1544
+ "learning_rate": 9.045555555555557e-05,
1545
+ "loss": 0.8074,
1546
+ "step": 1860
1547
+ },
1548
+ {
1549
+ "epoch": 2.328767123287671,
1550
+ "grad_norm": 8.625,
1551
+ "learning_rate": 9.034444444444445e-05,
1552
+ "loss": 0.7866,
1553
+ "step": 1870
1554
+ },
1555
+ {
1556
+ "epoch": 2.3412204234122043,
1557
+ "grad_norm": 10.1875,
1558
+ "learning_rate": 9.023333333333334e-05,
1559
+ "loss": 0.8159,
1560
+ "step": 1880
1561
+ },
1562
+ {
1563
+ "epoch": 2.3536737235367373,
1564
+ "grad_norm": 9.875,
1565
+ "learning_rate": 9.012222222222223e-05,
1566
+ "loss": 0.831,
1567
+ "step": 1890
1568
+ },
1569
+ {
1570
+ "epoch": 2.3661270236612704,
1571
+ "grad_norm": 11.0,
1572
+ "learning_rate": 9.001111111111112e-05,
1573
+ "loss": 0.7215,
1574
+ "step": 1900
1575
+ },
1576
+ {
1577
+ "epoch": 2.3661270236612704,
1578
+ "eval/acc": 48.83720779418945,
1579
+ "step": 1900
1580
+ },
1581
+ {
1582
+ "epoch": 2.3661270236612704,
1583
+ "eval_loss": 1.9242758750915527,
1584
+ "eval_runtime": 0.2274,
1585
+ "eval_samples_per_second": 189.058,
1586
+ "eval_steps_per_second": 4.397,
1587
+ "step": 1900
1588
+ },
1589
+ {
1590
+ "epoch": 2.3785803237858034,
1591
+ "grad_norm": 9.8125,
1592
+ "learning_rate": 8.99e-05,
1593
+ "loss": 0.8346,
1594
+ "step": 1910
1595
+ },
1596
+ {
1597
+ "epoch": 2.391033623910336,
1598
+ "grad_norm": 9.9375,
1599
+ "learning_rate": 8.978888888888889e-05,
1600
+ "loss": 0.8141,
1601
+ "step": 1920
1602
+ },
1603
+ {
1604
+ "epoch": 2.403486924034869,
1605
+ "grad_norm": 10.3125,
1606
+ "learning_rate": 8.967777777777779e-05,
1607
+ "loss": 0.911,
1608
+ "step": 1930
1609
+ },
1610
+ {
1611
+ "epoch": 2.415940224159402,
1612
+ "grad_norm": 9.75,
1613
+ "learning_rate": 8.956666666666667e-05,
1614
+ "loss": 0.9486,
1615
+ "step": 1940
1616
+ },
1617
+ {
1618
+ "epoch": 2.428393524283935,
1619
+ "grad_norm": 9.25,
1620
+ "learning_rate": 8.945555555555556e-05,
1621
+ "loss": 0.8775,
1622
+ "step": 1950
1623
+ },
1624
+ {
1625
+ "epoch": 2.4408468244084682,
1626
+ "grad_norm": 8.0,
1627
+ "learning_rate": 8.934444444444445e-05,
1628
+ "loss": 0.8373,
1629
+ "step": 1960
1630
+ },
1631
+ {
1632
+ "epoch": 2.4533001245330013,
1633
+ "grad_norm": 7.625,
1634
+ "learning_rate": 8.923333333333334e-05,
1635
+ "loss": 0.7469,
1636
+ "step": 1970
1637
+ },
1638
+ {
1639
+ "epoch": 2.4657534246575343,
1640
+ "grad_norm": 37.75,
1641
+ "learning_rate": 8.912222222222222e-05,
1642
+ "loss": 0.7934,
1643
+ "step": 1980
1644
+ },
1645
+ {
1646
+ "epoch": 2.4782067247820674,
1647
+ "grad_norm": 9.125,
1648
+ "learning_rate": 8.901111111111111e-05,
1649
+ "loss": 0.7733,
1650
+ "step": 1990
1651
+ },
1652
+ {
1653
+ "epoch": 2.4906600249066004,
1654
+ "grad_norm": 8.8125,
1655
+ "learning_rate": 8.89e-05,
1656
+ "loss": 0.7488,
1657
+ "step": 2000
1658
+ },
1659
+ {
1660
+ "epoch": 2.4906600249066004,
1661
+ "eval/acc": 48.83720779418945,
1662
+ "step": 2000
1663
+ },
1664
+ {
1665
+ "epoch": 2.4906600249066004,
1666
+ "eval_loss": 1.8490980863571167,
1667
+ "eval_runtime": 0.2184,
1668
+ "eval_samples_per_second": 196.883,
1669
+ "eval_steps_per_second": 4.579,
1670
+ "step": 2000
1671
+ },
1672
+ {
1673
+ "epoch": 2.5031133250311335,
1674
+ "grad_norm": 8.0,
1675
+ "learning_rate": 8.878888888888889e-05,
1676
+ "loss": 0.8461,
1677
+ "step": 2010
1678
+ },
1679
+ {
1680
+ "epoch": 2.515566625155666,
1681
+ "grad_norm": 8.75,
1682
+ "learning_rate": 8.867777777777778e-05,
1683
+ "loss": 0.7647,
1684
+ "step": 2020
1685
+ },
1686
+ {
1687
+ "epoch": 2.528019925280199,
1688
+ "grad_norm": 8.8125,
1689
+ "learning_rate": 8.856666666666667e-05,
1690
+ "loss": 0.796,
1691
+ "step": 2030
1692
+ },
1693
+ {
1694
+ "epoch": 2.540473225404732,
1695
+ "grad_norm": 7.78125,
1696
+ "learning_rate": 8.845555555555556e-05,
1697
+ "loss": 0.7758,
1698
+ "step": 2040
1699
+ },
1700
+ {
1701
+ "epoch": 2.5529265255292652,
1702
+ "grad_norm": 7.75,
1703
+ "learning_rate": 8.834444444444446e-05,
1704
+ "loss": 0.7753,
1705
+ "step": 2050
1706
+ },
1707
+ {
1708
+ "epoch": 2.5653798256537983,
1709
+ "grad_norm": 8.9375,
1710
+ "learning_rate": 8.823333333333334e-05,
1711
+ "loss": 0.6914,
1712
+ "step": 2060
1713
+ },
1714
+ {
1715
+ "epoch": 2.5778331257783313,
1716
+ "grad_norm": 9.4375,
1717
+ "learning_rate": 8.812222222222223e-05,
1718
+ "loss": 0.787,
1719
+ "step": 2070
1720
+ },
1721
+ {
1722
+ "epoch": 2.5902864259028644,
1723
+ "grad_norm": 8.125,
1724
+ "learning_rate": 8.801111111111111e-05,
1725
+ "loss": 0.7742,
1726
+ "step": 2080
1727
+ },
1728
+ {
1729
+ "epoch": 2.602739726027397,
1730
+ "grad_norm": 10.6875,
1731
+ "learning_rate": 8.790000000000001e-05,
1732
+ "loss": 0.7528,
1733
+ "step": 2090
1734
+ },
1735
+ {
1736
+ "epoch": 2.61519302615193,
1737
+ "grad_norm": 9.1875,
1738
+ "learning_rate": 8.77888888888889e-05,
1739
+ "loss": 0.7392,
1740
+ "step": 2100
1741
+ },
1742
+ {
1743
+ "epoch": 2.61519302615193,
1744
+ "eval/acc": 46.511627197265625,
1745
+ "step": 2100
1746
+ },
1747
+ {
1748
+ "epoch": 2.61519302615193,
1749
+ "eval_loss": 1.9725399017333984,
1750
+ "eval_runtime": 0.214,
1751
+ "eval_samples_per_second": 200.925,
1752
+ "eval_steps_per_second": 4.673,
1753
+ "step": 2100
1754
+ },
1755
+ {
1756
+ "epoch": 2.627646326276463,
1757
+ "grad_norm": 9.375,
1758
+ "learning_rate": 8.767777777777778e-05,
1759
+ "loss": 0.7993,
1760
+ "step": 2110
1761
+ },
1762
+ {
1763
+ "epoch": 2.640099626400996,
1764
+ "grad_norm": 9.0625,
1765
+ "learning_rate": 8.756666666666668e-05,
1766
+ "loss": 0.854,
1767
+ "step": 2120
1768
+ },
1769
+ {
1770
+ "epoch": 2.652552926525529,
1771
+ "grad_norm": 10.625,
1772
+ "learning_rate": 8.745555555555556e-05,
1773
+ "loss": 0.8887,
1774
+ "step": 2130
1775
+ },
1776
+ {
1777
+ "epoch": 2.6650062266500623,
1778
+ "grad_norm": 7.75,
1779
+ "learning_rate": 8.734444444444445e-05,
1780
+ "loss": 0.7407,
1781
+ "step": 2140
1782
+ },
1783
+ {
1784
+ "epoch": 2.6774595267745953,
1785
+ "grad_norm": 10.75,
1786
+ "learning_rate": 8.723333333333333e-05,
1787
+ "loss": 0.9187,
1788
+ "step": 2150
1789
+ },
1790
+ {
1791
+ "epoch": 2.6899128268991284,
1792
+ "grad_norm": 7.71875,
1793
+ "learning_rate": 8.712222222222223e-05,
1794
+ "loss": 0.7804,
1795
+ "step": 2160
1796
+ },
1797
+ {
1798
+ "epoch": 2.7023661270236614,
1799
+ "grad_norm": 7.34375,
1800
+ "learning_rate": 8.701111111111111e-05,
1801
+ "loss": 0.7368,
1802
+ "step": 2170
1803
+ },
1804
+ {
1805
+ "epoch": 2.7148194271481945,
1806
+ "grad_norm": 10.0625,
1807
+ "learning_rate": 8.69e-05,
1808
+ "loss": 0.7027,
1809
+ "step": 2180
1810
+ },
1811
+ {
1812
+ "epoch": 2.7272727272727275,
1813
+ "grad_norm": 12.875,
1814
+ "learning_rate": 8.67888888888889e-05,
1815
+ "loss": 0.8305,
1816
+ "step": 2190
1817
+ },
1818
+ {
1819
+ "epoch": 2.73972602739726,
1820
+ "grad_norm": 9.125,
1821
+ "learning_rate": 8.667777777777778e-05,
1822
+ "loss": 0.7767,
1823
+ "step": 2200
1824
+ },
1825
+ {
1826
+ "epoch": 2.73972602739726,
1827
+ "eval/acc": 48.83720779418945,
1828
+ "step": 2200
1829
+ },
1830
+ {
1831
+ "epoch": 2.73972602739726,
1832
+ "eval_loss": 1.8356798887252808,
1833
+ "eval_runtime": 0.2116,
1834
+ "eval_samples_per_second": 203.247,
1835
+ "eval_steps_per_second": 4.727,
1836
+ "step": 2200
1837
+ },
1838
+ {
1839
+ "epoch": 2.752179327521793,
1840
+ "grad_norm": 8.4375,
1841
+ "learning_rate": 8.656666666666668e-05,
1842
+ "loss": 0.7547,
1843
+ "step": 2210
1844
+ },
1845
+ {
1846
+ "epoch": 2.7646326276463262,
1847
+ "grad_norm": 7.5,
1848
+ "learning_rate": 8.645555555555555e-05,
1849
+ "loss": 0.8497,
1850
+ "step": 2220
1851
+ },
1852
+ {
1853
+ "epoch": 2.7770859277708593,
1854
+ "grad_norm": 10.0625,
1855
+ "learning_rate": 8.634444444444445e-05,
1856
+ "loss": 0.8024,
1857
+ "step": 2230
1858
+ },
1859
+ {
1860
+ "epoch": 2.7895392278953923,
1861
+ "grad_norm": 13.5,
1862
+ "learning_rate": 8.623333333333333e-05,
1863
+ "loss": 0.7806,
1864
+ "step": 2240
1865
+ },
1866
+ {
1867
+ "epoch": 2.8019925280199254,
1868
+ "grad_norm": 10.8125,
1869
+ "learning_rate": 8.612222222222223e-05,
1870
+ "loss": 0.7021,
1871
+ "step": 2250
1872
+ },
1873
+ {
1874
+ "epoch": 2.8144458281444584,
1875
+ "grad_norm": 9.3125,
1876
+ "learning_rate": 8.601111111111112e-05,
1877
+ "loss": 0.72,
1878
+ "step": 2260
1879
+ },
1880
+ {
1881
+ "epoch": 2.826899128268991,
1882
+ "grad_norm": 8.875,
1883
+ "learning_rate": 8.59e-05,
1884
+ "loss": 0.8063,
1885
+ "step": 2270
1886
+ },
1887
+ {
1888
+ "epoch": 2.839352428393524,
1889
+ "grad_norm": 8.75,
1890
+ "learning_rate": 8.57888888888889e-05,
1891
+ "loss": 0.8264,
1892
+ "step": 2280
1893
+ },
1894
+ {
1895
+ "epoch": 2.851805728518057,
1896
+ "grad_norm": 8.75,
1897
+ "learning_rate": 8.567777777777778e-05,
1898
+ "loss": 0.814,
1899
+ "step": 2290
1900
+ },
1901
+ {
1902
+ "epoch": 2.86425902864259,
1903
+ "grad_norm": 10.25,
1904
+ "learning_rate": 8.556666666666667e-05,
1905
+ "loss": 0.7985,
1906
+ "step": 2300
1907
+ },
1908
+ {
1909
+ "epoch": 2.86425902864259,
1910
+ "eval/acc": 51.16279220581055,
1911
+ "step": 2300
1912
+ },
1913
+ {
1914
+ "epoch": 2.86425902864259,
1915
+ "eval_loss": 1.9056586027145386,
1916
+ "eval_runtime": 0.221,
1917
+ "eval_samples_per_second": 194.606,
1918
+ "eval_steps_per_second": 4.526,
1919
+ "step": 2300
1920
+ },
1921
+ {
1922
+ "epoch": 2.8767123287671232,
1923
+ "grad_norm": 8.6875,
1924
+ "learning_rate": 8.545555555555555e-05,
1925
+ "loss": 0.7489,
1926
+ "step": 2310
1927
+ },
1928
+ {
1929
+ "epoch": 2.8891656288916563,
1930
+ "grad_norm": 9.25,
1931
+ "learning_rate": 8.534444444444445e-05,
1932
+ "loss": 0.8398,
1933
+ "step": 2320
1934
+ },
1935
+ {
1936
+ "epoch": 2.9016189290161893,
1937
+ "grad_norm": 8.8125,
1938
+ "learning_rate": 8.523333333333334e-05,
1939
+ "loss": 0.7808,
1940
+ "step": 2330
1941
+ },
1942
+ {
1943
+ "epoch": 2.9140722291407224,
1944
+ "grad_norm": 8.625,
1945
+ "learning_rate": 8.512222222222222e-05,
1946
+ "loss": 0.8163,
1947
+ "step": 2340
1948
+ },
1949
+ {
1950
+ "epoch": 2.9265255292652554,
1951
+ "grad_norm": 13.9375,
1952
+ "learning_rate": 8.501111111111112e-05,
1953
+ "loss": 0.8038,
1954
+ "step": 2350
1955
+ },
1956
+ {
1957
+ "epoch": 2.9389788293897885,
1958
+ "grad_norm": 11.8125,
1959
+ "learning_rate": 8.49e-05,
1960
+ "loss": 0.7362,
1961
+ "step": 2360
1962
+ },
1963
+ {
1964
+ "epoch": 2.9514321295143215,
1965
+ "grad_norm": 12.0625,
1966
+ "learning_rate": 8.47888888888889e-05,
1967
+ "loss": 0.8096,
1968
+ "step": 2370
1969
+ },
1970
+ {
1971
+ "epoch": 2.963885429638854,
1972
+ "grad_norm": 10.4375,
1973
+ "learning_rate": 8.467777777777777e-05,
1974
+ "loss": 0.7728,
1975
+ "step": 2380
1976
+ },
1977
+ {
1978
+ "epoch": 2.976338729763387,
1979
+ "grad_norm": 11.875,
1980
+ "learning_rate": 8.456666666666667e-05,
1981
+ "loss": 0.8224,
1982
+ "step": 2390
1983
+ },
1984
+ {
1985
+ "epoch": 2.9887920298879203,
1986
+ "grad_norm": 8.375,
1987
+ "learning_rate": 8.445555555555556e-05,
1988
+ "loss": 0.8418,
1989
+ "step": 2400
1990
+ },
1991
+ {
1992
+ "epoch": 2.9887920298879203,
1993
+ "eval/acc": 46.511627197265625,
1994
+ "step": 2400
1995
+ },
1996
+ {
1997
+ "epoch": 2.9887920298879203,
1998
+ "eval_loss": 1.9594019651412964,
1999
+ "eval_runtime": 0.2169,
2000
+ "eval_samples_per_second": 198.209,
2001
+ "eval_steps_per_second": 4.61,
2002
+ "step": 2400
2003
+ },
2004
+ {
2005
+ "epoch": 3.0012453300124533,
2006
+ "grad_norm": 6.78125,
2007
+ "learning_rate": 8.434444444444445e-05,
2008
+ "loss": 0.7483,
2009
+ "step": 2410
2010
+ },
2011
+ {
2012
+ "epoch": 3.0136986301369864,
2013
+ "grad_norm": 8.75,
2014
+ "learning_rate": 8.423333333333334e-05,
2015
+ "loss": 0.7396,
2016
+ "step": 2420
2017
+ },
2018
+ {
2019
+ "epoch": 3.0261519302615194,
2020
+ "grad_norm": 15.6875,
2021
+ "learning_rate": 8.412222222222222e-05,
2022
+ "loss": 0.7436,
2023
+ "step": 2430
2024
+ },
2025
+ {
2026
+ "epoch": 3.0386052303860525,
2027
+ "grad_norm": 8.5625,
2028
+ "learning_rate": 8.401111111111112e-05,
2029
+ "loss": 0.6092,
2030
+ "step": 2440
2031
+ },
2032
+ {
2033
+ "epoch": 3.0510585305105855,
2034
+ "grad_norm": 11.0,
2035
+ "learning_rate": 8.39e-05,
2036
+ "loss": 0.7142,
2037
+ "step": 2450
2038
+ },
2039
+ {
2040
+ "epoch": 3.063511830635118,
2041
+ "grad_norm": 11.8125,
2042
+ "learning_rate": 8.378888888888889e-05,
2043
+ "loss": 0.692,
2044
+ "step": 2460
2045
+ },
2046
+ {
2047
+ "epoch": 3.075965130759651,
2048
+ "grad_norm": 10.8125,
2049
+ "learning_rate": 8.367777777777778e-05,
2050
+ "loss": 0.672,
2051
+ "step": 2470
2052
+ },
2053
+ {
2054
+ "epoch": 3.088418430884184,
2055
+ "grad_norm": 9.0625,
2056
+ "learning_rate": 8.356666666666667e-05,
2057
+ "loss": 0.6947,
2058
+ "step": 2480
2059
+ },
2060
+ {
2061
+ "epoch": 3.1008717310087173,
2062
+ "grad_norm": 9.0625,
2063
+ "learning_rate": 8.345555555555556e-05,
2064
+ "loss": 0.7188,
2065
+ "step": 2490
2066
+ },
2067
+ {
2068
+ "epoch": 3.1133250311332503,
2069
+ "grad_norm": 7.125,
2070
+ "learning_rate": 8.334444444444444e-05,
2071
+ "loss": 0.6621,
2072
+ "step": 2500
2073
+ },
2074
+ {
2075
+ "epoch": 3.1133250311332503,
2076
+ "eval/acc": 41.86046600341797,
2077
+ "step": 2500
2078
+ },
2079
+ {
2080
+ "epoch": 3.1133250311332503,
2081
+ "eval_loss": 2.2108497619628906,
2082
+ "eval_runtime": 2.0608,
2083
+ "eval_samples_per_second": 20.866,
2084
+ "eval_steps_per_second": 0.485,
2085
+ "step": 2500
2086
+ }
2087
+ ],
2088
+ "logging_steps": 10,
2089
+ "max_steps": 10000,
2090
+ "num_input_tokens_seen": 0,
2091
+ "num_train_epochs": 13,
2092
+ "save_steps": 2500,
2093
+ "stateful_callbacks": {
2094
+ "TrainerControl": {
2095
+ "args": {
2096
+ "should_epoch_stop": false,
2097
+ "should_evaluate": false,
2098
+ "should_log": false,
2099
+ "should_save": true,
2100
+ "should_training_stop": false
2101
+ },
2102
+ "attributes": {}
2103
+ }
2104
+ },
2105
+ "total_flos": 0.0,
2106
+ "train_batch_size": 16,
2107
+ "trial_name": null,
2108
+ "trial_params": null
2109
+ }
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61b297d5e64c898db22dbb0b2c7feb17b604dfbcc3bfebf55e9e7ecbf9c3794c
3
+ size 6161
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ModernBertModel"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 50281,
8
+ "classifier_activation": "gelu",
9
+ "classifier_bias": false,
10
+ "classifier_dropout": 0.0,
11
+ "classifier_pooling": "mean",
12
+ "cls_token_id": 50281,
13
+ "decoder_bias": true,
14
+ "deterministic_flash_attn": false,
15
+ "dtype": "bfloat16",
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "repad_logits_with_grad": false,
40
+ "sep_token_id": 50282,
41
+ "sparse_pred_ignore_index": -100,
42
+ "sparse_prediction": false,
43
+ "transformers_version": "4.57.1",
44
+ "vocab_size": 50368
45
+ }
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c8ced7bda17e694a323b59668f51b10dbaf3dd3577d631459b9cb69ef78adb7
3
+ size 298041696
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edb64b8cef0a3dfc25477198f91d7185d0407eac4e6e1ff0b31a40675c252898
3
+ size 596170443
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e02ca9ae115ea16fec032391633efd7a900f47635f27c79cf4d01a0dec960d3
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98fc86822b150867dd155dd03d026ce3dd7af59775e2a5feacb7751718cd127c
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15e20421b6d20eab5ea415631e2b4770e15ae33eba8329cddc9b7141c145aee0
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c654a7dfacad68a4b4444888a91f6df0461c57090ffb3da9c95f3de4477f1988
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:506f6e39bd983d811639cf9d5aea75be4643e6c5adeffc1e40a2ab6e23817ea8
3
+ size 1465
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json ADDED
@@ -0,0 +1,4184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 6.226650062266501,
6
+ "eval_steps": 100,
7
+ "global_step": 5000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.012453300124533,
14
+ "grad_norm": 15.75,
15
+ "learning_rate": 9e-07,
16
+ "loss": 5.5753,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.024906600249066,
21
+ "grad_norm": 25.25,
22
+ "learning_rate": 1.9e-06,
23
+ "loss": 5.5776,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.037359900373599,
28
+ "grad_norm": 16.25,
29
+ "learning_rate": 2.9e-06,
30
+ "loss": 5.5572,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.049813200498132,
35
+ "grad_norm": 23.5,
36
+ "learning_rate": 3.9e-06,
37
+ "loss": 5.5201,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.062266500622665005,
42
+ "grad_norm": 18.0,
43
+ "learning_rate": 4.9000000000000005e-06,
44
+ "loss": 5.6297,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.074719800747198,
49
+ "grad_norm": 19.125,
50
+ "learning_rate": 5.9e-06,
51
+ "loss": 5.5889,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.08717310087173101,
56
+ "grad_norm": 23.125,
57
+ "learning_rate": 6.900000000000001e-06,
58
+ "loss": 5.4949,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.099626400996264,
63
+ "grad_norm": 20.25,
64
+ "learning_rate": 7.9e-06,
65
+ "loss": 5.552,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.11207970112079702,
70
+ "grad_norm": 17.0,
71
+ "learning_rate": 8.9e-06,
72
+ "loss": 5.4765,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.12453300124533001,
77
+ "grad_norm": 20.125,
78
+ "learning_rate": 9.900000000000002e-06,
79
+ "loss": 5.4519,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.12453300124533001,
84
+ "eval/acc": 2.3255813121795654,
85
+ "step": 100
86
+ },
87
+ {
88
+ "epoch": 0.12453300124533001,
89
+ "eval_loss": 4.970202445983887,
90
+ "eval_runtime": 2.822,
91
+ "eval_samples_per_second": 15.237,
92
+ "eval_steps_per_second": 0.354,
93
+ "step": 100
94
+ },
95
+ {
96
+ "epoch": 0.136986301369863,
97
+ "grad_norm": 22.125,
98
+ "learning_rate": 1.09e-05,
99
+ "loss": 5.3401,
100
+ "step": 110
101
+ },
102
+ {
103
+ "epoch": 0.149439601494396,
104
+ "grad_norm": 16.25,
105
+ "learning_rate": 1.19e-05,
106
+ "loss": 5.3088,
107
+ "step": 120
108
+ },
109
+ {
110
+ "epoch": 0.16189290161892902,
111
+ "grad_norm": 18.75,
112
+ "learning_rate": 1.29e-05,
113
+ "loss": 5.1442,
114
+ "step": 130
115
+ },
116
+ {
117
+ "epoch": 0.17434620174346202,
118
+ "grad_norm": 19.5,
119
+ "learning_rate": 1.3900000000000002e-05,
120
+ "loss": 5.0218,
121
+ "step": 140
122
+ },
123
+ {
124
+ "epoch": 0.18679950186799502,
125
+ "grad_norm": 25.75,
126
+ "learning_rate": 1.49e-05,
127
+ "loss": 4.8711,
128
+ "step": 150
129
+ },
130
+ {
131
+ "epoch": 0.199252801992528,
132
+ "grad_norm": 25.625,
133
+ "learning_rate": 1.59e-05,
134
+ "loss": 4.6046,
135
+ "step": 160
136
+ },
137
+ {
138
+ "epoch": 0.21170610211706103,
139
+ "grad_norm": 28.25,
140
+ "learning_rate": 1.69e-05,
141
+ "loss": 4.2891,
142
+ "step": 170
143
+ },
144
+ {
145
+ "epoch": 0.22415940224159403,
146
+ "grad_norm": 25.25,
147
+ "learning_rate": 1.79e-05,
148
+ "loss": 3.8055,
149
+ "step": 180
150
+ },
151
+ {
152
+ "epoch": 0.23661270236612703,
153
+ "grad_norm": 28.0,
154
+ "learning_rate": 1.8900000000000002e-05,
155
+ "loss": 3.4139,
156
+ "step": 190
157
+ },
158
+ {
159
+ "epoch": 0.24906600249066002,
160
+ "grad_norm": 29.5,
161
+ "learning_rate": 1.9900000000000003e-05,
162
+ "loss": 2.974,
163
+ "step": 200
164
+ },
165
+ {
166
+ "epoch": 0.24906600249066002,
167
+ "eval/acc": 11.627906799316406,
168
+ "step": 200
169
+ },
170
+ {
171
+ "epoch": 0.24906600249066002,
172
+ "eval_loss": 3.7072134017944336,
173
+ "eval_runtime": 0.2742,
174
+ "eval_samples_per_second": 156.807,
175
+ "eval_steps_per_second": 3.647,
176
+ "step": 200
177
+ },
178
+ {
179
+ "epoch": 0.261519302615193,
180
+ "grad_norm": 30.5,
181
+ "learning_rate": 2.09e-05,
182
+ "loss": 2.8723,
183
+ "step": 210
184
+ },
185
+ {
186
+ "epoch": 0.273972602739726,
187
+ "grad_norm": 19.625,
188
+ "learning_rate": 2.19e-05,
189
+ "loss": 2.6908,
190
+ "step": 220
191
+ },
192
+ {
193
+ "epoch": 0.286425902864259,
194
+ "grad_norm": 18.25,
195
+ "learning_rate": 2.29e-05,
196
+ "loss": 2.4715,
197
+ "step": 230
198
+ },
199
+ {
200
+ "epoch": 0.298879202988792,
201
+ "grad_norm": 16.75,
202
+ "learning_rate": 2.39e-05,
203
+ "loss": 2.4336,
204
+ "step": 240
205
+ },
206
+ {
207
+ "epoch": 0.31133250311332505,
208
+ "grad_norm": 16.875,
209
+ "learning_rate": 2.4900000000000002e-05,
210
+ "loss": 2.3797,
211
+ "step": 250
212
+ },
213
+ {
214
+ "epoch": 0.32378580323785805,
215
+ "grad_norm": 18.375,
216
+ "learning_rate": 2.5900000000000003e-05,
217
+ "loss": 2.2765,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 0.33623910336239105,
222
+ "grad_norm": 13.625,
223
+ "learning_rate": 2.6900000000000003e-05,
224
+ "loss": 2.1124,
225
+ "step": 270
226
+ },
227
+ {
228
+ "epoch": 0.34869240348692404,
229
+ "grad_norm": 19.5,
230
+ "learning_rate": 2.7900000000000004e-05,
231
+ "loss": 2.0748,
232
+ "step": 280
233
+ },
234
+ {
235
+ "epoch": 0.36114570361145704,
236
+ "grad_norm": 18.0,
237
+ "learning_rate": 2.8899999999999998e-05,
238
+ "loss": 2.1575,
239
+ "step": 290
240
+ },
241
+ {
242
+ "epoch": 0.37359900373599003,
243
+ "grad_norm": 34.0,
244
+ "learning_rate": 2.9900000000000002e-05,
245
+ "loss": 2.1195,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 0.37359900373599003,
250
+ "eval/acc": 23.255813598632812,
251
+ "step": 300
252
+ },
253
+ {
254
+ "epoch": 0.37359900373599003,
255
+ "eval_loss": 3.1418063640594482,
256
+ "eval_runtime": 1.1652,
257
+ "eval_samples_per_second": 36.903,
258
+ "eval_steps_per_second": 0.858,
259
+ "step": 300
260
+ },
261
+ {
262
+ "epoch": 0.386052303860523,
263
+ "grad_norm": 19.125,
264
+ "learning_rate": 3.09e-05,
265
+ "loss": 2.1052,
266
+ "step": 310
267
+ },
268
+ {
269
+ "epoch": 0.398505603985056,
270
+ "grad_norm": 20.375,
271
+ "learning_rate": 3.19e-05,
272
+ "loss": 1.8924,
273
+ "step": 320
274
+ },
275
+ {
276
+ "epoch": 0.410958904109589,
277
+ "grad_norm": 17.125,
278
+ "learning_rate": 3.29e-05,
279
+ "loss": 2.025,
280
+ "step": 330
281
+ },
282
+ {
283
+ "epoch": 0.42341220423412207,
284
+ "grad_norm": 28.0,
285
+ "learning_rate": 3.3900000000000004e-05,
286
+ "loss": 1.8914,
287
+ "step": 340
288
+ },
289
+ {
290
+ "epoch": 0.43586550435865506,
291
+ "grad_norm": 22.125,
292
+ "learning_rate": 3.49e-05,
293
+ "loss": 1.8864,
294
+ "step": 350
295
+ },
296
+ {
297
+ "epoch": 0.44831880448318806,
298
+ "grad_norm": 34.0,
299
+ "learning_rate": 3.59e-05,
300
+ "loss": 1.8447,
301
+ "step": 360
302
+ },
303
+ {
304
+ "epoch": 0.46077210460772106,
305
+ "grad_norm": 15.4375,
306
+ "learning_rate": 3.69e-05,
307
+ "loss": 1.7981,
308
+ "step": 370
309
+ },
310
+ {
311
+ "epoch": 0.47322540473225405,
312
+ "grad_norm": 39.25,
313
+ "learning_rate": 3.79e-05,
314
+ "loss": 1.6967,
315
+ "step": 380
316
+ },
317
+ {
318
+ "epoch": 0.48567870485678705,
319
+ "grad_norm": 35.25,
320
+ "learning_rate": 3.8900000000000004e-05,
321
+ "loss": 1.7919,
322
+ "step": 390
323
+ },
324
+ {
325
+ "epoch": 0.49813200498132004,
326
+ "grad_norm": 19.875,
327
+ "learning_rate": 3.99e-05,
328
+ "loss": 1.6083,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 0.49813200498132004,
333
+ "eval/acc": 27.9069766998291,
334
+ "step": 400
335
+ },
336
+ {
337
+ "epoch": 0.49813200498132004,
338
+ "eval_loss": 2.988025665283203,
339
+ "eval_runtime": 0.2197,
340
+ "eval_samples_per_second": 195.684,
341
+ "eval_steps_per_second": 4.551,
342
+ "step": 400
343
+ },
344
+ {
345
+ "epoch": 0.5105853051058531,
346
+ "grad_norm": 14.1875,
347
+ "learning_rate": 4.09e-05,
348
+ "loss": 1.7039,
349
+ "step": 410
350
+ },
351
+ {
352
+ "epoch": 0.523038605230386,
353
+ "grad_norm": 33.25,
354
+ "learning_rate": 4.19e-05,
355
+ "loss": 1.7057,
356
+ "step": 420
357
+ },
358
+ {
359
+ "epoch": 0.5354919053549191,
360
+ "grad_norm": 15.5,
361
+ "learning_rate": 4.29e-05,
362
+ "loss": 1.6425,
363
+ "step": 430
364
+ },
365
+ {
366
+ "epoch": 0.547945205479452,
367
+ "grad_norm": 29.625,
368
+ "learning_rate": 4.39e-05,
369
+ "loss": 1.4995,
370
+ "step": 440
371
+ },
372
+ {
373
+ "epoch": 0.5603985056039851,
374
+ "grad_norm": 15.0625,
375
+ "learning_rate": 4.49e-05,
376
+ "loss": 1.6621,
377
+ "step": 450
378
+ },
379
+ {
380
+ "epoch": 0.572851805728518,
381
+ "grad_norm": 22.25,
382
+ "learning_rate": 4.5900000000000004e-05,
383
+ "loss": 1.5684,
384
+ "step": 460
385
+ },
386
+ {
387
+ "epoch": 0.5853051058530511,
388
+ "grad_norm": 17.25,
389
+ "learning_rate": 4.69e-05,
390
+ "loss": 1.5414,
391
+ "step": 470
392
+ },
393
+ {
394
+ "epoch": 0.597758405977584,
395
+ "grad_norm": 19.25,
396
+ "learning_rate": 4.79e-05,
397
+ "loss": 1.5445,
398
+ "step": 480
399
+ },
400
+ {
401
+ "epoch": 0.6102117061021171,
402
+ "grad_norm": 205.0,
403
+ "learning_rate": 4.89e-05,
404
+ "loss": 1.4726,
405
+ "step": 490
406
+ },
407
+ {
408
+ "epoch": 0.6226650062266501,
409
+ "grad_norm": 13.375,
410
+ "learning_rate": 4.99e-05,
411
+ "loss": 1.3783,
412
+ "step": 500
413
+ },
414
+ {
415
+ "epoch": 0.6226650062266501,
416
+ "eval/acc": 30.23255729675293,
417
+ "step": 500
418
+ },
419
+ {
420
+ "epoch": 0.6226650062266501,
421
+ "eval_loss": 2.777118444442749,
422
+ "eval_runtime": 0.2153,
423
+ "eval_samples_per_second": 199.749,
424
+ "eval_steps_per_second": 4.645,
425
+ "step": 500
426
+ },
427
+ {
428
+ "epoch": 0.635118306351183,
429
+ "grad_norm": 20.875,
430
+ "learning_rate": 5.0900000000000004e-05,
431
+ "loss": 1.4983,
432
+ "step": 510
433
+ },
434
+ {
435
+ "epoch": 0.6475716064757161,
436
+ "grad_norm": 16.625,
437
+ "learning_rate": 5.19e-05,
438
+ "loss": 1.5093,
439
+ "step": 520
440
+ },
441
+ {
442
+ "epoch": 0.660024906600249,
443
+ "grad_norm": 14.125,
444
+ "learning_rate": 5.2900000000000005e-05,
445
+ "loss": 1.4588,
446
+ "step": 530
447
+ },
448
+ {
449
+ "epoch": 0.6724782067247821,
450
+ "grad_norm": 37.0,
451
+ "learning_rate": 5.390000000000001e-05,
452
+ "loss": 1.4346,
453
+ "step": 540
454
+ },
455
+ {
456
+ "epoch": 0.684931506849315,
457
+ "grad_norm": 16.75,
458
+ "learning_rate": 5.4900000000000006e-05,
459
+ "loss": 1.5363,
460
+ "step": 550
461
+ },
462
+ {
463
+ "epoch": 0.6973848069738481,
464
+ "grad_norm": 28.375,
465
+ "learning_rate": 5.590000000000001e-05,
466
+ "loss": 1.4497,
467
+ "step": 560
468
+ },
469
+ {
470
+ "epoch": 0.709838107098381,
471
+ "grad_norm": 15.5625,
472
+ "learning_rate": 5.69e-05,
473
+ "loss": 1.4005,
474
+ "step": 570
475
+ },
476
+ {
477
+ "epoch": 0.7222914072229141,
478
+ "grad_norm": 14.75,
479
+ "learning_rate": 5.79e-05,
480
+ "loss": 1.4588,
481
+ "step": 580
482
+ },
483
+ {
484
+ "epoch": 0.7347447073474471,
485
+ "grad_norm": 18.5,
486
+ "learning_rate": 5.89e-05,
487
+ "loss": 1.3489,
488
+ "step": 590
489
+ },
490
+ {
491
+ "epoch": 0.7471980074719801,
492
+ "grad_norm": 12.125,
493
+ "learning_rate": 5.99e-05,
494
+ "loss": 1.3295,
495
+ "step": 600
496
+ },
497
+ {
498
+ "epoch": 0.7471980074719801,
499
+ "eval/acc": 39.53488540649414,
500
+ "step": 600
501
+ },
502
+ {
503
+ "epoch": 0.7471980074719801,
504
+ "eval_loss": 2.6652462482452393,
505
+ "eval_runtime": 0.2211,
506
+ "eval_samples_per_second": 194.477,
507
+ "eval_steps_per_second": 4.523,
508
+ "step": 600
509
+ },
510
+ {
511
+ "epoch": 0.7596513075965131,
512
+ "grad_norm": 12.9375,
513
+ "learning_rate": 6.09e-05,
514
+ "loss": 1.3717,
515
+ "step": 610
516
+ },
517
+ {
518
+ "epoch": 0.772104607721046,
519
+ "grad_norm": 21.5,
520
+ "learning_rate": 6.19e-05,
521
+ "loss": 1.425,
522
+ "step": 620
523
+ },
524
+ {
525
+ "epoch": 0.7845579078455791,
526
+ "grad_norm": 13.6875,
527
+ "learning_rate": 6.29e-05,
528
+ "loss": 1.3017,
529
+ "step": 630
530
+ },
531
+ {
532
+ "epoch": 0.797011207970112,
533
+ "grad_norm": 12.8125,
534
+ "learning_rate": 6.390000000000001e-05,
535
+ "loss": 1.3533,
536
+ "step": 640
537
+ },
538
+ {
539
+ "epoch": 0.8094645080946451,
540
+ "grad_norm": 13.1875,
541
+ "learning_rate": 6.49e-05,
542
+ "loss": 1.271,
543
+ "step": 650
544
+ },
545
+ {
546
+ "epoch": 0.821917808219178,
547
+ "grad_norm": 15.125,
548
+ "learning_rate": 6.59e-05,
549
+ "loss": 1.3734,
550
+ "step": 660
551
+ },
552
+ {
553
+ "epoch": 0.8343711083437111,
554
+ "grad_norm": 16.125,
555
+ "learning_rate": 6.690000000000001e-05,
556
+ "loss": 1.3092,
557
+ "step": 670
558
+ },
559
+ {
560
+ "epoch": 0.8468244084682441,
561
+ "grad_norm": 17.75,
562
+ "learning_rate": 6.790000000000001e-05,
563
+ "loss": 1.1803,
564
+ "step": 680
565
+ },
566
+ {
567
+ "epoch": 0.8592777085927771,
568
+ "grad_norm": 13.875,
569
+ "learning_rate": 6.89e-05,
570
+ "loss": 1.3383,
571
+ "step": 690
572
+ },
573
+ {
574
+ "epoch": 0.8717310087173101,
575
+ "grad_norm": 11.25,
576
+ "learning_rate": 6.99e-05,
577
+ "loss": 1.3024,
578
+ "step": 700
579
+ },
580
+ {
581
+ "epoch": 0.8717310087173101,
582
+ "eval/acc": 34.88372039794922,
583
+ "step": 700
584
+ },
585
+ {
586
+ "epoch": 0.8717310087173101,
587
+ "eval_loss": 2.7215068340301514,
588
+ "eval_runtime": 0.3836,
589
+ "eval_samples_per_second": 112.097,
590
+ "eval_steps_per_second": 2.607,
591
+ "step": 700
592
+ },
593
+ {
594
+ "epoch": 0.8841843088418431,
595
+ "grad_norm": 67.0,
596
+ "learning_rate": 7.09e-05,
597
+ "loss": 1.2095,
598
+ "step": 710
599
+ },
600
+ {
601
+ "epoch": 0.8966376089663761,
602
+ "grad_norm": 9.875,
603
+ "learning_rate": 7.19e-05,
604
+ "loss": 1.2948,
605
+ "step": 720
606
+ },
607
+ {
608
+ "epoch": 0.9090909090909091,
609
+ "grad_norm": 14.1875,
610
+ "learning_rate": 7.29e-05,
611
+ "loss": 1.3225,
612
+ "step": 730
613
+ },
614
+ {
615
+ "epoch": 0.9215442092154421,
616
+ "grad_norm": 13.125,
617
+ "learning_rate": 7.390000000000001e-05,
618
+ "loss": 1.1936,
619
+ "step": 740
620
+ },
621
+ {
622
+ "epoch": 0.933997509339975,
623
+ "grad_norm": 12.875,
624
+ "learning_rate": 7.49e-05,
625
+ "loss": 1.2211,
626
+ "step": 750
627
+ },
628
+ {
629
+ "epoch": 0.9464508094645081,
630
+ "grad_norm": 13.5625,
631
+ "learning_rate": 7.59e-05,
632
+ "loss": 1.2435,
633
+ "step": 760
634
+ },
635
+ {
636
+ "epoch": 0.958904109589041,
637
+ "grad_norm": 19.25,
638
+ "learning_rate": 7.69e-05,
639
+ "loss": 1.1786,
640
+ "step": 770
641
+ },
642
+ {
643
+ "epoch": 0.9713574097135741,
644
+ "grad_norm": 14.375,
645
+ "learning_rate": 7.790000000000001e-05,
646
+ "loss": 1.2784,
647
+ "step": 780
648
+ },
649
+ {
650
+ "epoch": 0.9838107098381071,
651
+ "grad_norm": 10.625,
652
+ "learning_rate": 7.890000000000001e-05,
653
+ "loss": 1.2618,
654
+ "step": 790
655
+ },
656
+ {
657
+ "epoch": 0.9962640099626401,
658
+ "grad_norm": 22.75,
659
+ "learning_rate": 7.99e-05,
660
+ "loss": 1.1601,
661
+ "step": 800
662
+ },
663
+ {
664
+ "epoch": 0.9962640099626401,
665
+ "eval/acc": 41.86046600341797,
666
+ "step": 800
667
+ },
668
+ {
669
+ "epoch": 0.9962640099626401,
670
+ "eval_loss": 2.622220516204834,
671
+ "eval_runtime": 0.2076,
672
+ "eval_samples_per_second": 207.126,
673
+ "eval_steps_per_second": 4.817,
674
+ "step": 800
675
+ },
676
+ {
677
+ "epoch": 1.0087173100871731,
678
+ "grad_norm": 12.75,
679
+ "learning_rate": 8.090000000000001e-05,
680
+ "loss": 1.0972,
681
+ "step": 810
682
+ },
683
+ {
684
+ "epoch": 1.0211706102117062,
685
+ "grad_norm": 9.6875,
686
+ "learning_rate": 8.19e-05,
687
+ "loss": 1.1999,
688
+ "step": 820
689
+ },
690
+ {
691
+ "epoch": 1.033623910336239,
692
+ "grad_norm": 11.625,
693
+ "learning_rate": 8.29e-05,
694
+ "loss": 1.1677,
695
+ "step": 830
696
+ },
697
+ {
698
+ "epoch": 1.046077210460772,
699
+ "grad_norm": 12.8125,
700
+ "learning_rate": 8.39e-05,
701
+ "loss": 1.1505,
702
+ "step": 840
703
+ },
704
+ {
705
+ "epoch": 1.0585305105853051,
706
+ "grad_norm": 11.6875,
707
+ "learning_rate": 8.49e-05,
708
+ "loss": 1.1599,
709
+ "step": 850
710
+ },
711
+ {
712
+ "epoch": 1.0709838107098382,
713
+ "grad_norm": 9.8125,
714
+ "learning_rate": 8.59e-05,
715
+ "loss": 1.1746,
716
+ "step": 860
717
+ },
718
+ {
719
+ "epoch": 1.083437110834371,
720
+ "grad_norm": 11.625,
721
+ "learning_rate": 8.69e-05,
722
+ "loss": 1.047,
723
+ "step": 870
724
+ },
725
+ {
726
+ "epoch": 1.095890410958904,
727
+ "grad_norm": 10.125,
728
+ "learning_rate": 8.790000000000001e-05,
729
+ "loss": 1.107,
730
+ "step": 880
731
+ },
732
+ {
733
+ "epoch": 1.108343711083437,
734
+ "grad_norm": 9.0,
735
+ "learning_rate": 8.89e-05,
736
+ "loss": 1.1105,
737
+ "step": 890
738
+ },
739
+ {
740
+ "epoch": 1.1207970112079702,
741
+ "grad_norm": 13.125,
742
+ "learning_rate": 8.99e-05,
743
+ "loss": 1.1848,
744
+ "step": 900
745
+ },
746
+ {
747
+ "epoch": 1.1207970112079702,
748
+ "eval/acc": 34.88372039794922,
749
+ "step": 900
750
+ },
751
+ {
752
+ "epoch": 1.1207970112079702,
753
+ "eval_loss": 2.8814988136291504,
754
+ "eval_runtime": 1.0687,
755
+ "eval_samples_per_second": 40.237,
756
+ "eval_steps_per_second": 0.936,
757
+ "step": 900
758
+ },
759
+ {
760
+ "epoch": 1.1332503113325032,
761
+ "grad_norm": 13.25,
762
+ "learning_rate": 9.090000000000001e-05,
763
+ "loss": 1.1235,
764
+ "step": 910
765
+ },
766
+ {
767
+ "epoch": 1.145703611457036,
768
+ "grad_norm": 17.625,
769
+ "learning_rate": 9.190000000000001e-05,
770
+ "loss": 1.0304,
771
+ "step": 920
772
+ },
773
+ {
774
+ "epoch": 1.158156911581569,
775
+ "grad_norm": 11.5625,
776
+ "learning_rate": 9.290000000000001e-05,
777
+ "loss": 1.0373,
778
+ "step": 930
779
+ },
780
+ {
781
+ "epoch": 1.1706102117061021,
782
+ "grad_norm": 13.25,
783
+ "learning_rate": 9.39e-05,
784
+ "loss": 1.12,
785
+ "step": 940
786
+ },
787
+ {
788
+ "epoch": 1.1830635118306352,
789
+ "grad_norm": 10.4375,
790
+ "learning_rate": 9.49e-05,
791
+ "loss": 1.0623,
792
+ "step": 950
793
+ },
794
+ {
795
+ "epoch": 1.195516811955168,
796
+ "grad_norm": 14.625,
797
+ "learning_rate": 9.59e-05,
798
+ "loss": 1.0692,
799
+ "step": 960
800
+ },
801
+ {
802
+ "epoch": 1.207970112079701,
803
+ "grad_norm": 9.6875,
804
+ "learning_rate": 9.69e-05,
805
+ "loss": 1.1914,
806
+ "step": 970
807
+ },
808
+ {
809
+ "epoch": 1.2204234122042341,
810
+ "grad_norm": 10.4375,
811
+ "learning_rate": 9.790000000000001e-05,
812
+ "loss": 1.1094,
813
+ "step": 980
814
+ },
815
+ {
816
+ "epoch": 1.2328767123287672,
817
+ "grad_norm": 9.625,
818
+ "learning_rate": 9.89e-05,
819
+ "loss": 1.0557,
820
+ "step": 990
821
+ },
822
+ {
823
+ "epoch": 1.2453300124533002,
824
+ "grad_norm": 15.75,
825
+ "learning_rate": 9.99e-05,
826
+ "loss": 0.9635,
827
+ "step": 1000
828
+ },
829
+ {
830
+ "epoch": 1.2453300124533002,
831
+ "eval/acc": 34.88372039794922,
832
+ "step": 1000
833
+ },
834
+ {
835
+ "epoch": 1.2453300124533002,
836
+ "eval_loss": 2.967315435409546,
837
+ "eval_runtime": 0.2242,
838
+ "eval_samples_per_second": 191.798,
839
+ "eval_steps_per_second": 4.46,
840
+ "step": 1000
841
+ },
842
+ {
843
+ "epoch": 1.257783312577833,
844
+ "grad_norm": 11.8125,
845
+ "learning_rate": 9.99e-05,
846
+ "loss": 1.0067,
847
+ "step": 1010
848
+ },
849
+ {
850
+ "epoch": 1.270236612702366,
851
+ "grad_norm": 11.4375,
852
+ "learning_rate": 9.97888888888889e-05,
853
+ "loss": 1.0609,
854
+ "step": 1020
855
+ },
856
+ {
857
+ "epoch": 1.2826899128268991,
858
+ "grad_norm": 12.875,
859
+ "learning_rate": 9.967777777777779e-05,
860
+ "loss": 1.1566,
861
+ "step": 1030
862
+ },
863
+ {
864
+ "epoch": 1.2951432129514322,
865
+ "grad_norm": 10.625,
866
+ "learning_rate": 9.956666666666667e-05,
867
+ "loss": 1.1045,
868
+ "step": 1040
869
+ },
870
+ {
871
+ "epoch": 1.307596513075965,
872
+ "grad_norm": 10.0625,
873
+ "learning_rate": 9.945555555555555e-05,
874
+ "loss": 1.1421,
875
+ "step": 1050
876
+ },
877
+ {
878
+ "epoch": 1.320049813200498,
879
+ "grad_norm": 11.5625,
880
+ "learning_rate": 9.934444444444445e-05,
881
+ "loss": 1.0453,
882
+ "step": 1060
883
+ },
884
+ {
885
+ "epoch": 1.3325031133250311,
886
+ "grad_norm": 11.3125,
887
+ "learning_rate": 9.923333333333334e-05,
888
+ "loss": 1.0531,
889
+ "step": 1070
890
+ },
891
+ {
892
+ "epoch": 1.3449564134495642,
893
+ "grad_norm": 11.75,
894
+ "learning_rate": 9.912222222222222e-05,
895
+ "loss": 1.0286,
896
+ "step": 1080
897
+ },
898
+ {
899
+ "epoch": 1.3574097135740972,
900
+ "grad_norm": 11.3125,
901
+ "learning_rate": 9.901111111111112e-05,
902
+ "loss": 0.9549,
903
+ "step": 1090
904
+ },
905
+ {
906
+ "epoch": 1.36986301369863,
907
+ "grad_norm": 10.5625,
908
+ "learning_rate": 9.89e-05,
909
+ "loss": 1.006,
910
+ "step": 1100
911
+ },
912
+ {
913
+ "epoch": 1.36986301369863,
914
+ "eval/acc": 34.88372039794922,
915
+ "step": 1100
916
+ },
917
+ {
918
+ "epoch": 1.36986301369863,
919
+ "eval_loss": 2.9681856632232666,
920
+ "eval_runtime": 0.2343,
921
+ "eval_samples_per_second": 183.518,
922
+ "eval_steps_per_second": 4.268,
923
+ "step": 1100
924
+ },
925
+ {
926
+ "epoch": 1.3823163138231631,
927
+ "grad_norm": 13.4375,
928
+ "learning_rate": 9.87888888888889e-05,
929
+ "loss": 1.049,
930
+ "step": 1110
931
+ },
932
+ {
933
+ "epoch": 1.3947696139476962,
934
+ "grad_norm": 13.125,
935
+ "learning_rate": 9.867777777777777e-05,
936
+ "loss": 0.951,
937
+ "step": 1120
938
+ },
939
+ {
940
+ "epoch": 1.4072229140722292,
941
+ "grad_norm": 8.6875,
942
+ "learning_rate": 9.856666666666667e-05,
943
+ "loss": 1.0806,
944
+ "step": 1130
945
+ },
946
+ {
947
+ "epoch": 1.419676214196762,
948
+ "grad_norm": 11.8125,
949
+ "learning_rate": 9.845555555555556e-05,
950
+ "loss": 0.9683,
951
+ "step": 1140
952
+ },
953
+ {
954
+ "epoch": 1.432129514321295,
955
+ "grad_norm": 14.875,
956
+ "learning_rate": 9.834444444444446e-05,
957
+ "loss": 0.977,
958
+ "step": 1150
959
+ },
960
+ {
961
+ "epoch": 1.4445828144458281,
962
+ "grad_norm": 20.125,
963
+ "learning_rate": 9.823333333333333e-05,
964
+ "loss": 0.994,
965
+ "step": 1160
966
+ },
967
+ {
968
+ "epoch": 1.4570361145703612,
969
+ "grad_norm": 11.0,
970
+ "learning_rate": 9.812222222222223e-05,
971
+ "loss": 1.037,
972
+ "step": 1170
973
+ },
974
+ {
975
+ "epoch": 1.4694894146948942,
976
+ "grad_norm": 15.5,
977
+ "learning_rate": 9.801111111111112e-05,
978
+ "loss": 1.1605,
979
+ "step": 1180
980
+ },
981
+ {
982
+ "epoch": 1.481942714819427,
983
+ "grad_norm": 10.9375,
984
+ "learning_rate": 9.790000000000001e-05,
985
+ "loss": 1.0113,
986
+ "step": 1190
987
+ },
988
+ {
989
+ "epoch": 1.4943960149439601,
990
+ "grad_norm": 14.3125,
991
+ "learning_rate": 9.778888888888889e-05,
992
+ "loss": 0.9511,
993
+ "step": 1200
994
+ },
995
+ {
996
+ "epoch": 1.4943960149439601,
997
+ "eval/acc": 37.20930099487305,
998
+ "step": 1200
999
+ },
1000
+ {
1001
+ "epoch": 1.4943960149439601,
1002
+ "eval_loss": 2.701927423477173,
1003
+ "eval_runtime": 0.2099,
1004
+ "eval_samples_per_second": 204.857,
1005
+ "eval_steps_per_second": 4.764,
1006
+ "step": 1200
1007
+ },
1008
+ {
1009
+ "epoch": 1.5068493150684932,
1010
+ "grad_norm": 11.9375,
1011
+ "learning_rate": 9.767777777777778e-05,
1012
+ "loss": 1.0408,
1013
+ "step": 1210
1014
+ },
1015
+ {
1016
+ "epoch": 1.519302615193026,
1017
+ "grad_norm": 7.71875,
1018
+ "learning_rate": 9.756666666666668e-05,
1019
+ "loss": 0.9782,
1020
+ "step": 1220
1021
+ },
1022
+ {
1023
+ "epoch": 1.531755915317559,
1024
+ "grad_norm": 7.5,
1025
+ "learning_rate": 9.745555555555556e-05,
1026
+ "loss": 1.0293,
1027
+ "step": 1230
1028
+ },
1029
+ {
1030
+ "epoch": 1.544209215442092,
1031
+ "grad_norm": 9.6875,
1032
+ "learning_rate": 9.734444444444444e-05,
1033
+ "loss": 0.9718,
1034
+ "step": 1240
1035
+ },
1036
+ {
1037
+ "epoch": 1.5566625155666252,
1038
+ "grad_norm": 11.0,
1039
+ "learning_rate": 9.723333333333334e-05,
1040
+ "loss": 1.0542,
1041
+ "step": 1250
1042
+ },
1043
+ {
1044
+ "epoch": 1.5691158156911582,
1045
+ "grad_norm": 10.5,
1046
+ "learning_rate": 9.712222222222223e-05,
1047
+ "loss": 0.9537,
1048
+ "step": 1260
1049
+ },
1050
+ {
1051
+ "epoch": 1.5815691158156913,
1052
+ "grad_norm": 13.1875,
1053
+ "learning_rate": 9.701111111111111e-05,
1054
+ "loss": 0.9756,
1055
+ "step": 1270
1056
+ },
1057
+ {
1058
+ "epoch": 1.5940224159402243,
1059
+ "grad_norm": 9.9375,
1060
+ "learning_rate": 9.69e-05,
1061
+ "loss": 0.8843,
1062
+ "step": 1280
1063
+ },
1064
+ {
1065
+ "epoch": 1.6064757160647571,
1066
+ "grad_norm": 10.0,
1067
+ "learning_rate": 9.67888888888889e-05,
1068
+ "loss": 0.8808,
1069
+ "step": 1290
1070
+ },
1071
+ {
1072
+ "epoch": 1.6189290161892902,
1073
+ "grad_norm": 14.0,
1074
+ "learning_rate": 9.667777777777778e-05,
1075
+ "loss": 0.9589,
1076
+ "step": 1300
1077
+ },
1078
+ {
1079
+ "epoch": 1.6189290161892902,
1080
+ "eval/acc": 39.53488540649414,
1081
+ "step": 1300
1082
+ },
1083
+ {
1084
+ "epoch": 1.6189290161892902,
1085
+ "eval_loss": 2.7926037311553955,
1086
+ "eval_runtime": 0.2238,
1087
+ "eval_samples_per_second": 192.128,
1088
+ "eval_steps_per_second": 4.468,
1089
+ "step": 1300
1090
+ },
1091
+ {
1092
+ "epoch": 1.631382316313823,
1093
+ "grad_norm": 8.9375,
1094
+ "learning_rate": 9.656666666666668e-05,
1095
+ "loss": 0.9315,
1096
+ "step": 1310
1097
+ },
1098
+ {
1099
+ "epoch": 1.643835616438356,
1100
+ "grad_norm": 10.0625,
1101
+ "learning_rate": 9.645555555555556e-05,
1102
+ "loss": 0.9295,
1103
+ "step": 1320
1104
+ },
1105
+ {
1106
+ "epoch": 1.6562889165628891,
1107
+ "grad_norm": 8.75,
1108
+ "learning_rate": 9.634444444444445e-05,
1109
+ "loss": 0.9255,
1110
+ "step": 1330
1111
+ },
1112
+ {
1113
+ "epoch": 1.6687422166874222,
1114
+ "grad_norm": 11.0625,
1115
+ "learning_rate": 9.623333333333335e-05,
1116
+ "loss": 0.9121,
1117
+ "step": 1340
1118
+ },
1119
+ {
1120
+ "epoch": 1.6811955168119552,
1121
+ "grad_norm": 11.375,
1122
+ "learning_rate": 9.612222222222223e-05,
1123
+ "loss": 0.9232,
1124
+ "step": 1350
1125
+ },
1126
+ {
1127
+ "epoch": 1.6936488169364883,
1128
+ "grad_norm": 11.875,
1129
+ "learning_rate": 9.601111111111112e-05,
1130
+ "loss": 0.8991,
1131
+ "step": 1360
1132
+ },
1133
+ {
1134
+ "epoch": 1.7061021170610213,
1135
+ "grad_norm": 9.0,
1136
+ "learning_rate": 9.59e-05,
1137
+ "loss": 0.9405,
1138
+ "step": 1370
1139
+ },
1140
+ {
1141
+ "epoch": 1.7185554171855542,
1142
+ "grad_norm": 11.875,
1143
+ "learning_rate": 9.57888888888889e-05,
1144
+ "loss": 1.0191,
1145
+ "step": 1380
1146
+ },
1147
+ {
1148
+ "epoch": 1.7310087173100872,
1149
+ "grad_norm": 9.8125,
1150
+ "learning_rate": 9.567777777777778e-05,
1151
+ "loss": 0.9002,
1152
+ "step": 1390
1153
+ },
1154
+ {
1155
+ "epoch": 1.74346201743462,
1156
+ "grad_norm": 12.375,
1157
+ "learning_rate": 9.556666666666667e-05,
1158
+ "loss": 0.9681,
1159
+ "step": 1400
1160
+ },
1161
+ {
1162
+ "epoch": 1.74346201743462,
1163
+ "eval/acc": 39.53488540649414,
1164
+ "step": 1400
1165
+ },
1166
+ {
1167
+ "epoch": 1.74346201743462,
1168
+ "eval_loss": 2.795476198196411,
1169
+ "eval_runtime": 0.2152,
1170
+ "eval_samples_per_second": 199.833,
1171
+ "eval_steps_per_second": 4.647,
1172
+ "step": 1400
1173
+ },
1174
+ {
1175
+ "epoch": 1.755915317559153,
1176
+ "grad_norm": 9.375,
1177
+ "learning_rate": 9.545555555555557e-05,
1178
+ "loss": 1.0222,
1179
+ "step": 1410
1180
+ },
1181
+ {
1182
+ "epoch": 1.7683686176836861,
1183
+ "grad_norm": 9.5625,
1184
+ "learning_rate": 9.534444444444445e-05,
1185
+ "loss": 0.9005,
1186
+ "step": 1420
1187
+ },
1188
+ {
1189
+ "epoch": 1.7808219178082192,
1190
+ "grad_norm": 9.875,
1191
+ "learning_rate": 9.523333333333334e-05,
1192
+ "loss": 0.9616,
1193
+ "step": 1430
1194
+ },
1195
+ {
1196
+ "epoch": 1.7932752179327522,
1197
+ "grad_norm": 28.0,
1198
+ "learning_rate": 9.512222222222222e-05,
1199
+ "loss": 1.0197,
1200
+ "step": 1440
1201
+ },
1202
+ {
1203
+ "epoch": 1.8057285180572853,
1204
+ "grad_norm": 10.75,
1205
+ "learning_rate": 9.501111111111112e-05,
1206
+ "loss": 0.9947,
1207
+ "step": 1450
1208
+ },
1209
+ {
1210
+ "epoch": 1.8181818181818183,
1211
+ "grad_norm": 10.125,
1212
+ "learning_rate": 9.49e-05,
1213
+ "loss": 0.9064,
1214
+ "step": 1460
1215
+ },
1216
+ {
1217
+ "epoch": 1.8306351183063512,
1218
+ "grad_norm": 11.75,
1219
+ "learning_rate": 9.478888888888889e-05,
1220
+ "loss": 0.9425,
1221
+ "step": 1470
1222
+ },
1223
+ {
1224
+ "epoch": 1.8430884184308842,
1225
+ "grad_norm": 10.625,
1226
+ "learning_rate": 9.467777777777779e-05,
1227
+ "loss": 1.0284,
1228
+ "step": 1480
1229
+ },
1230
+ {
1231
+ "epoch": 1.855541718555417,
1232
+ "grad_norm": 10.125,
1233
+ "learning_rate": 9.456666666666667e-05,
1234
+ "loss": 0.9175,
1235
+ "step": 1490
1236
+ },
1237
+ {
1238
+ "epoch": 1.86799501867995,
1239
+ "grad_norm": 8.375,
1240
+ "learning_rate": 9.445555555555557e-05,
1241
+ "loss": 0.8608,
1242
+ "step": 1500
1243
+ },
1244
+ {
1245
+ "epoch": 1.86799501867995,
1246
+ "eval/acc": 39.53488540649414,
1247
+ "step": 1500
1248
+ },
1249
+ {
1250
+ "epoch": 1.86799501867995,
1251
+ "eval_loss": 2.8291714191436768,
1252
+ "eval_runtime": 0.216,
1253
+ "eval_samples_per_second": 199.031,
1254
+ "eval_steps_per_second": 4.629,
1255
+ "step": 1500
1256
+ },
1257
+ {
1258
+ "epoch": 1.8804483188044832,
1259
+ "grad_norm": 9.625,
1260
+ "learning_rate": 9.434444444444444e-05,
1261
+ "loss": 0.9695,
1262
+ "step": 1510
1263
+ },
1264
+ {
1265
+ "epoch": 1.8929016189290162,
1266
+ "grad_norm": 9.8125,
1267
+ "learning_rate": 9.423333333333334e-05,
1268
+ "loss": 0.9924,
1269
+ "step": 1520
1270
+ },
1271
+ {
1272
+ "epoch": 1.9053549190535493,
1273
+ "grad_norm": 10.0,
1274
+ "learning_rate": 9.412222222222222e-05,
1275
+ "loss": 1.0733,
1276
+ "step": 1530
1277
+ },
1278
+ {
1279
+ "epoch": 1.9178082191780823,
1280
+ "grad_norm": 10.25,
1281
+ "learning_rate": 9.401111111111112e-05,
1282
+ "loss": 0.8818,
1283
+ "step": 1540
1284
+ },
1285
+ {
1286
+ "epoch": 1.9302615193026154,
1287
+ "grad_norm": 15.3125,
1288
+ "learning_rate": 9.39e-05,
1289
+ "loss": 0.9053,
1290
+ "step": 1550
1291
+ },
1292
+ {
1293
+ "epoch": 1.9427148194271482,
1294
+ "grad_norm": 8.25,
1295
+ "learning_rate": 9.378888888888889e-05,
1296
+ "loss": 0.8586,
1297
+ "step": 1560
1298
+ },
1299
+ {
1300
+ "epoch": 1.9551681195516812,
1301
+ "grad_norm": 17.5,
1302
+ "learning_rate": 9.367777777777779e-05,
1303
+ "loss": 0.9316,
1304
+ "step": 1570
1305
+ },
1306
+ {
1307
+ "epoch": 1.967621419676214,
1308
+ "grad_norm": 10.875,
1309
+ "learning_rate": 9.356666666666667e-05,
1310
+ "loss": 1.0195,
1311
+ "step": 1580
1312
+ },
1313
+ {
1314
+ "epoch": 1.9800747198007471,
1315
+ "grad_norm": 9.1875,
1316
+ "learning_rate": 9.345555555555556e-05,
1317
+ "loss": 0.8878,
1318
+ "step": 1590
1319
+ },
1320
+ {
1321
+ "epoch": 1.9925280199252802,
1322
+ "grad_norm": 10.3125,
1323
+ "learning_rate": 9.334444444444444e-05,
1324
+ "loss": 0.9765,
1325
+ "step": 1600
1326
+ },
1327
+ {
1328
+ "epoch": 1.9925280199252802,
1329
+ "eval/acc": 37.20930099487305,
1330
+ "step": 1600
1331
+ },
1332
+ {
1333
+ "epoch": 1.9925280199252802,
1334
+ "eval_loss": 2.9084553718566895,
1335
+ "eval_runtime": 0.2099,
1336
+ "eval_samples_per_second": 204.856,
1337
+ "eval_steps_per_second": 4.764,
1338
+ "step": 1600
1339
+ },
1340
+ {
1341
+ "epoch": 2.004981320049813,
1342
+ "grad_norm": 8.6875,
1343
+ "learning_rate": 9.323333333333334e-05,
1344
+ "loss": 0.8596,
1345
+ "step": 1610
1346
+ },
1347
+ {
1348
+ "epoch": 2.0174346201743463,
1349
+ "grad_norm": 12.0625,
1350
+ "learning_rate": 9.312222222222223e-05,
1351
+ "loss": 0.9156,
1352
+ "step": 1620
1353
+ },
1354
+ {
1355
+ "epoch": 2.0298879202988793,
1356
+ "grad_norm": 10.1875,
1357
+ "learning_rate": 9.301111111111111e-05,
1358
+ "loss": 0.8404,
1359
+ "step": 1630
1360
+ },
1361
+ {
1362
+ "epoch": 2.0423412204234124,
1363
+ "grad_norm": 10.125,
1364
+ "learning_rate": 9.290000000000001e-05,
1365
+ "loss": 0.8111,
1366
+ "step": 1640
1367
+ },
1368
+ {
1369
+ "epoch": 2.0547945205479454,
1370
+ "grad_norm": 10.625,
1371
+ "learning_rate": 9.278888888888889e-05,
1372
+ "loss": 0.8124,
1373
+ "step": 1650
1374
+ },
1375
+ {
1376
+ "epoch": 2.067247820672478,
1377
+ "grad_norm": 10.0,
1378
+ "learning_rate": 9.267777777777779e-05,
1379
+ "loss": 0.8124,
1380
+ "step": 1660
1381
+ },
1382
+ {
1383
+ "epoch": 2.079701120797011,
1384
+ "grad_norm": 10.75,
1385
+ "learning_rate": 9.256666666666666e-05,
1386
+ "loss": 0.8384,
1387
+ "step": 1670
1388
+ },
1389
+ {
1390
+ "epoch": 2.092154420921544,
1391
+ "grad_norm": 8.3125,
1392
+ "learning_rate": 9.245555555555556e-05,
1393
+ "loss": 0.8734,
1394
+ "step": 1680
1395
+ },
1396
+ {
1397
+ "epoch": 2.104607721046077,
1398
+ "grad_norm": 8.6875,
1399
+ "learning_rate": 9.234444444444445e-05,
1400
+ "loss": 0.7674,
1401
+ "step": 1690
1402
+ },
1403
+ {
1404
+ "epoch": 2.1170610211706102,
1405
+ "grad_norm": 6.6875,
1406
+ "learning_rate": 9.223333333333334e-05,
1407
+ "loss": 0.8514,
1408
+ "step": 1700
1409
+ },
1410
+ {
1411
+ "epoch": 2.1170610211706102,
1412
+ "eval/acc": 48.83720779418945,
1413
+ "step": 1700
1414
+ },
1415
+ {
1416
+ "epoch": 2.1170610211706102,
1417
+ "eval_loss": 1.9776915311813354,
1418
+ "eval_runtime": 1.2116,
1419
+ "eval_samples_per_second": 35.491,
1420
+ "eval_steps_per_second": 0.825,
1421
+ "step": 1700
1422
+ },
1423
+ {
1424
+ "epoch": 2.1295143212951433,
1425
+ "grad_norm": 11.3125,
1426
+ "learning_rate": 9.212222222222223e-05,
1427
+ "loss": 0.8502,
1428
+ "step": 1710
1429
+ },
1430
+ {
1431
+ "epoch": 2.1419676214196763,
1432
+ "grad_norm": 12.0625,
1433
+ "learning_rate": 9.201111111111111e-05,
1434
+ "loss": 0.9026,
1435
+ "step": 1720
1436
+ },
1437
+ {
1438
+ "epoch": 2.1544209215442094,
1439
+ "grad_norm": 8.3125,
1440
+ "learning_rate": 9.190000000000001e-05,
1441
+ "loss": 0.7893,
1442
+ "step": 1730
1443
+ },
1444
+ {
1445
+ "epoch": 2.166874221668742,
1446
+ "grad_norm": 14.5625,
1447
+ "learning_rate": 9.17888888888889e-05,
1448
+ "loss": 0.7671,
1449
+ "step": 1740
1450
+ },
1451
+ {
1452
+ "epoch": 2.179327521793275,
1453
+ "grad_norm": 11.25,
1454
+ "learning_rate": 9.167777777777778e-05,
1455
+ "loss": 0.7869,
1456
+ "step": 1750
1457
+ },
1458
+ {
1459
+ "epoch": 2.191780821917808,
1460
+ "grad_norm": 8.625,
1461
+ "learning_rate": 9.156666666666667e-05,
1462
+ "loss": 0.8251,
1463
+ "step": 1760
1464
+ },
1465
+ {
1466
+ "epoch": 2.204234122042341,
1467
+ "grad_norm": 7.8125,
1468
+ "learning_rate": 9.145555555555556e-05,
1469
+ "loss": 0.7838,
1470
+ "step": 1770
1471
+ },
1472
+ {
1473
+ "epoch": 2.216687422166874,
1474
+ "grad_norm": 11.6875,
1475
+ "learning_rate": 9.134444444444445e-05,
1476
+ "loss": 0.8348,
1477
+ "step": 1780
1478
+ },
1479
+ {
1480
+ "epoch": 2.2291407222914073,
1481
+ "grad_norm": 9.75,
1482
+ "learning_rate": 9.123333333333333e-05,
1483
+ "loss": 0.8322,
1484
+ "step": 1790
1485
+ },
1486
+ {
1487
+ "epoch": 2.2415940224159403,
1488
+ "grad_norm": 9.25,
1489
+ "learning_rate": 9.112222222222223e-05,
1490
+ "loss": 0.8514,
1491
+ "step": 1800
1492
+ },
1493
+ {
1494
+ "epoch": 2.2415940224159403,
1495
+ "eval/acc": 48.83720779418945,
1496
+ "step": 1800
1497
+ },
1498
+ {
1499
+ "epoch": 2.2415940224159403,
1500
+ "eval_loss": 1.968414306640625,
1501
+ "eval_runtime": 0.2153,
1502
+ "eval_samples_per_second": 199.733,
1503
+ "eval_steps_per_second": 4.645,
1504
+ "step": 1800
1505
+ },
1506
+ {
1507
+ "epoch": 2.2540473225404734,
1508
+ "grad_norm": 12.6875,
1509
+ "learning_rate": 9.101111111111112e-05,
1510
+ "loss": 0.7841,
1511
+ "step": 1810
1512
+ },
1513
+ {
1514
+ "epoch": 2.2665006226650064,
1515
+ "grad_norm": 7.09375,
1516
+ "learning_rate": 9.090000000000001e-05,
1517
+ "loss": 0.7889,
1518
+ "step": 1820
1519
+ },
1520
+ {
1521
+ "epoch": 2.2789539227895395,
1522
+ "grad_norm": 8.1875,
1523
+ "learning_rate": 9.078888888888889e-05,
1524
+ "loss": 0.8088,
1525
+ "step": 1830
1526
+ },
1527
+ {
1528
+ "epoch": 2.291407222914072,
1529
+ "grad_norm": 12.3125,
1530
+ "learning_rate": 9.067777777777778e-05,
1531
+ "loss": 0.8247,
1532
+ "step": 1840
1533
+ },
1534
+ {
1535
+ "epoch": 2.303860523038605,
1536
+ "grad_norm": 7.40625,
1537
+ "learning_rate": 9.056666666666667e-05,
1538
+ "loss": 0.7383,
1539
+ "step": 1850
1540
+ },
1541
+ {
1542
+ "epoch": 2.316313823163138,
1543
+ "grad_norm": 8.5,
1544
+ "learning_rate": 9.045555555555557e-05,
1545
+ "loss": 0.8074,
1546
+ "step": 1860
1547
+ },
1548
+ {
1549
+ "epoch": 2.328767123287671,
1550
+ "grad_norm": 8.625,
1551
+ "learning_rate": 9.034444444444445e-05,
1552
+ "loss": 0.7866,
1553
+ "step": 1870
1554
+ },
1555
+ {
1556
+ "epoch": 2.3412204234122043,
1557
+ "grad_norm": 10.1875,
1558
+ "learning_rate": 9.023333333333334e-05,
1559
+ "loss": 0.8159,
1560
+ "step": 1880
1561
+ },
1562
+ {
1563
+ "epoch": 2.3536737235367373,
1564
+ "grad_norm": 9.875,
1565
+ "learning_rate": 9.012222222222223e-05,
1566
+ "loss": 0.831,
1567
+ "step": 1890
1568
+ },
1569
+ {
1570
+ "epoch": 2.3661270236612704,
1571
+ "grad_norm": 11.0,
1572
+ "learning_rate": 9.001111111111112e-05,
1573
+ "loss": 0.7215,
1574
+ "step": 1900
1575
+ },
1576
+ {
1577
+ "epoch": 2.3661270236612704,
1578
+ "eval/acc": 48.83720779418945,
1579
+ "step": 1900
1580
+ },
1581
+ {
1582
+ "epoch": 2.3661270236612704,
1583
+ "eval_loss": 1.9242758750915527,
1584
+ "eval_runtime": 0.2274,
1585
+ "eval_samples_per_second": 189.058,
1586
+ "eval_steps_per_second": 4.397,
1587
+ "step": 1900
1588
+ },
1589
+ {
1590
+ "epoch": 2.3785803237858034,
1591
+ "grad_norm": 9.8125,
1592
+ "learning_rate": 8.99e-05,
1593
+ "loss": 0.8346,
1594
+ "step": 1910
1595
+ },
1596
+ {
1597
+ "epoch": 2.391033623910336,
1598
+ "grad_norm": 9.9375,
1599
+ "learning_rate": 8.978888888888889e-05,
1600
+ "loss": 0.8141,
1601
+ "step": 1920
1602
+ },
1603
+ {
1604
+ "epoch": 2.403486924034869,
1605
+ "grad_norm": 10.3125,
1606
+ "learning_rate": 8.967777777777779e-05,
1607
+ "loss": 0.911,
1608
+ "step": 1930
1609
+ },
1610
+ {
1611
+ "epoch": 2.415940224159402,
1612
+ "grad_norm": 9.75,
1613
+ "learning_rate": 8.956666666666667e-05,
1614
+ "loss": 0.9486,
1615
+ "step": 1940
1616
+ },
1617
+ {
1618
+ "epoch": 2.428393524283935,
1619
+ "grad_norm": 9.25,
1620
+ "learning_rate": 8.945555555555556e-05,
1621
+ "loss": 0.8775,
1622
+ "step": 1950
1623
+ },
1624
+ {
1625
+ "epoch": 2.4408468244084682,
1626
+ "grad_norm": 8.0,
1627
+ "learning_rate": 8.934444444444445e-05,
1628
+ "loss": 0.8373,
1629
+ "step": 1960
1630
+ },
1631
+ {
1632
+ "epoch": 2.4533001245330013,
1633
+ "grad_norm": 7.625,
1634
+ "learning_rate": 8.923333333333334e-05,
1635
+ "loss": 0.7469,
1636
+ "step": 1970
1637
+ },
1638
+ {
1639
+ "epoch": 2.4657534246575343,
1640
+ "grad_norm": 37.75,
1641
+ "learning_rate": 8.912222222222222e-05,
1642
+ "loss": 0.7934,
1643
+ "step": 1980
1644
+ },
1645
+ {
1646
+ "epoch": 2.4782067247820674,
1647
+ "grad_norm": 9.125,
1648
+ "learning_rate": 8.901111111111111e-05,
1649
+ "loss": 0.7733,
1650
+ "step": 1990
1651
+ },
1652
+ {
1653
+ "epoch": 2.4906600249066004,
1654
+ "grad_norm": 8.8125,
1655
+ "learning_rate": 8.89e-05,
1656
+ "loss": 0.7488,
1657
+ "step": 2000
1658
+ },
1659
+ {
1660
+ "epoch": 2.4906600249066004,
1661
+ "eval/acc": 48.83720779418945,
1662
+ "step": 2000
1663
+ },
1664
+ {
1665
+ "epoch": 2.4906600249066004,
1666
+ "eval_loss": 1.8490980863571167,
1667
+ "eval_runtime": 0.2184,
1668
+ "eval_samples_per_second": 196.883,
1669
+ "eval_steps_per_second": 4.579,
1670
+ "step": 2000
1671
+ },
1672
+ {
1673
+ "epoch": 2.5031133250311335,
1674
+ "grad_norm": 8.0,
1675
+ "learning_rate": 8.878888888888889e-05,
1676
+ "loss": 0.8461,
1677
+ "step": 2010
1678
+ },
1679
+ {
1680
+ "epoch": 2.515566625155666,
1681
+ "grad_norm": 8.75,
1682
+ "learning_rate": 8.867777777777778e-05,
1683
+ "loss": 0.7647,
1684
+ "step": 2020
1685
+ },
1686
+ {
1687
+ "epoch": 2.528019925280199,
1688
+ "grad_norm": 8.8125,
1689
+ "learning_rate": 8.856666666666667e-05,
1690
+ "loss": 0.796,
1691
+ "step": 2030
1692
+ },
1693
+ {
1694
+ "epoch": 2.540473225404732,
1695
+ "grad_norm": 7.78125,
1696
+ "learning_rate": 8.845555555555556e-05,
1697
+ "loss": 0.7758,
1698
+ "step": 2040
1699
+ },
1700
+ {
1701
+ "epoch": 2.5529265255292652,
1702
+ "grad_norm": 7.75,
1703
+ "learning_rate": 8.834444444444446e-05,
1704
+ "loss": 0.7753,
1705
+ "step": 2050
1706
+ },
1707
+ {
1708
+ "epoch": 2.5653798256537983,
1709
+ "grad_norm": 8.9375,
1710
+ "learning_rate": 8.823333333333334e-05,
1711
+ "loss": 0.6914,
1712
+ "step": 2060
1713
+ },
1714
+ {
1715
+ "epoch": 2.5778331257783313,
1716
+ "grad_norm": 9.4375,
1717
+ "learning_rate": 8.812222222222223e-05,
1718
+ "loss": 0.787,
1719
+ "step": 2070
1720
+ },
1721
+ {
1722
+ "epoch": 2.5902864259028644,
1723
+ "grad_norm": 8.125,
1724
+ "learning_rate": 8.801111111111111e-05,
1725
+ "loss": 0.7742,
1726
+ "step": 2080
1727
+ },
1728
+ {
1729
+ "epoch": 2.602739726027397,
1730
+ "grad_norm": 10.6875,
1731
+ "learning_rate": 8.790000000000001e-05,
1732
+ "loss": 0.7528,
1733
+ "step": 2090
1734
+ },
1735
+ {
1736
+ "epoch": 2.61519302615193,
1737
+ "grad_norm": 9.1875,
1738
+ "learning_rate": 8.77888888888889e-05,
1739
+ "loss": 0.7392,
1740
+ "step": 2100
1741
+ },
1742
+ {
1743
+ "epoch": 2.61519302615193,
1744
+ "eval/acc": 46.511627197265625,
1745
+ "step": 2100
1746
+ },
1747
+ {
1748
+ "epoch": 2.61519302615193,
1749
+ "eval_loss": 1.9725399017333984,
1750
+ "eval_runtime": 0.214,
1751
+ "eval_samples_per_second": 200.925,
1752
+ "eval_steps_per_second": 4.673,
1753
+ "step": 2100
1754
+ },
1755
+ {
1756
+ "epoch": 2.627646326276463,
1757
+ "grad_norm": 9.375,
1758
+ "learning_rate": 8.767777777777778e-05,
1759
+ "loss": 0.7993,
1760
+ "step": 2110
1761
+ },
1762
+ {
1763
+ "epoch": 2.640099626400996,
1764
+ "grad_norm": 9.0625,
1765
+ "learning_rate": 8.756666666666668e-05,
1766
+ "loss": 0.854,
1767
+ "step": 2120
1768
+ },
1769
+ {
1770
+ "epoch": 2.652552926525529,
1771
+ "grad_norm": 10.625,
1772
+ "learning_rate": 8.745555555555556e-05,
1773
+ "loss": 0.8887,
1774
+ "step": 2130
1775
+ },
1776
+ {
1777
+ "epoch": 2.6650062266500623,
1778
+ "grad_norm": 7.75,
1779
+ "learning_rate": 8.734444444444445e-05,
1780
+ "loss": 0.7407,
1781
+ "step": 2140
1782
+ },
1783
+ {
1784
+ "epoch": 2.6774595267745953,
1785
+ "grad_norm": 10.75,
1786
+ "learning_rate": 8.723333333333333e-05,
1787
+ "loss": 0.9187,
1788
+ "step": 2150
1789
+ },
1790
+ {
1791
+ "epoch": 2.6899128268991284,
1792
+ "grad_norm": 7.71875,
1793
+ "learning_rate": 8.712222222222223e-05,
1794
+ "loss": 0.7804,
1795
+ "step": 2160
1796
+ },
1797
+ {
1798
+ "epoch": 2.7023661270236614,
1799
+ "grad_norm": 7.34375,
1800
+ "learning_rate": 8.701111111111111e-05,
1801
+ "loss": 0.7368,
1802
+ "step": 2170
1803
+ },
1804
+ {
1805
+ "epoch": 2.7148194271481945,
1806
+ "grad_norm": 10.0625,
1807
+ "learning_rate": 8.69e-05,
1808
+ "loss": 0.7027,
1809
+ "step": 2180
1810
+ },
1811
+ {
1812
+ "epoch": 2.7272727272727275,
1813
+ "grad_norm": 12.875,
1814
+ "learning_rate": 8.67888888888889e-05,
1815
+ "loss": 0.8305,
1816
+ "step": 2190
1817
+ },
1818
+ {
1819
+ "epoch": 2.73972602739726,
1820
+ "grad_norm": 9.125,
1821
+ "learning_rate": 8.667777777777778e-05,
1822
+ "loss": 0.7767,
1823
+ "step": 2200
1824
+ },
1825
+ {
1826
+ "epoch": 2.73972602739726,
1827
+ "eval/acc": 48.83720779418945,
1828
+ "step": 2200
1829
+ },
1830
+ {
1831
+ "epoch": 2.73972602739726,
1832
+ "eval_loss": 1.8356798887252808,
1833
+ "eval_runtime": 0.2116,
1834
+ "eval_samples_per_second": 203.247,
1835
+ "eval_steps_per_second": 4.727,
1836
+ "step": 2200
1837
+ },
1838
+ {
1839
+ "epoch": 2.752179327521793,
1840
+ "grad_norm": 8.4375,
1841
+ "learning_rate": 8.656666666666668e-05,
1842
+ "loss": 0.7547,
1843
+ "step": 2210
1844
+ },
1845
+ {
1846
+ "epoch": 2.7646326276463262,
1847
+ "grad_norm": 7.5,
1848
+ "learning_rate": 8.645555555555555e-05,
1849
+ "loss": 0.8497,
1850
+ "step": 2220
1851
+ },
1852
+ {
1853
+ "epoch": 2.7770859277708593,
1854
+ "grad_norm": 10.0625,
1855
+ "learning_rate": 8.634444444444445e-05,
1856
+ "loss": 0.8024,
1857
+ "step": 2230
1858
+ },
1859
+ {
1860
+ "epoch": 2.7895392278953923,
1861
+ "grad_norm": 13.5,
1862
+ "learning_rate": 8.623333333333333e-05,
1863
+ "loss": 0.7806,
1864
+ "step": 2240
1865
+ },
1866
+ {
1867
+ "epoch": 2.8019925280199254,
1868
+ "grad_norm": 10.8125,
1869
+ "learning_rate": 8.612222222222223e-05,
1870
+ "loss": 0.7021,
1871
+ "step": 2250
1872
+ },
1873
+ {
1874
+ "epoch": 2.8144458281444584,
1875
+ "grad_norm": 9.3125,
1876
+ "learning_rate": 8.601111111111112e-05,
1877
+ "loss": 0.72,
1878
+ "step": 2260
1879
+ },
1880
+ {
1881
+ "epoch": 2.826899128268991,
1882
+ "grad_norm": 8.875,
1883
+ "learning_rate": 8.59e-05,
1884
+ "loss": 0.8063,
1885
+ "step": 2270
1886
+ },
1887
+ {
1888
+ "epoch": 2.839352428393524,
1889
+ "grad_norm": 8.75,
1890
+ "learning_rate": 8.57888888888889e-05,
1891
+ "loss": 0.8264,
1892
+ "step": 2280
1893
+ },
1894
+ {
1895
+ "epoch": 2.851805728518057,
1896
+ "grad_norm": 8.75,
1897
+ "learning_rate": 8.567777777777778e-05,
1898
+ "loss": 0.814,
1899
+ "step": 2290
1900
+ },
1901
+ {
1902
+ "epoch": 2.86425902864259,
1903
+ "grad_norm": 10.25,
1904
+ "learning_rate": 8.556666666666667e-05,
1905
+ "loss": 0.7985,
1906
+ "step": 2300
1907
+ },
1908
+ {
1909
+ "epoch": 2.86425902864259,
1910
+ "eval/acc": 51.16279220581055,
1911
+ "step": 2300
1912
+ },
1913
+ {
1914
+ "epoch": 2.86425902864259,
1915
+ "eval_loss": 1.9056586027145386,
1916
+ "eval_runtime": 0.221,
1917
+ "eval_samples_per_second": 194.606,
1918
+ "eval_steps_per_second": 4.526,
1919
+ "step": 2300
1920
+ },
1921
+ {
1922
+ "epoch": 2.8767123287671232,
1923
+ "grad_norm": 8.6875,
1924
+ "learning_rate": 8.545555555555555e-05,
1925
+ "loss": 0.7489,
1926
+ "step": 2310
1927
+ },
1928
+ {
1929
+ "epoch": 2.8891656288916563,
1930
+ "grad_norm": 9.25,
1931
+ "learning_rate": 8.534444444444445e-05,
1932
+ "loss": 0.8398,
1933
+ "step": 2320
1934
+ },
1935
+ {
1936
+ "epoch": 2.9016189290161893,
1937
+ "grad_norm": 8.8125,
1938
+ "learning_rate": 8.523333333333334e-05,
1939
+ "loss": 0.7808,
1940
+ "step": 2330
1941
+ },
1942
+ {
1943
+ "epoch": 2.9140722291407224,
1944
+ "grad_norm": 8.625,
1945
+ "learning_rate": 8.512222222222222e-05,
1946
+ "loss": 0.8163,
1947
+ "step": 2340
1948
+ },
1949
+ {
1950
+ "epoch": 2.9265255292652554,
1951
+ "grad_norm": 13.9375,
1952
+ "learning_rate": 8.501111111111112e-05,
1953
+ "loss": 0.8038,
1954
+ "step": 2350
1955
+ },
1956
+ {
1957
+ "epoch": 2.9389788293897885,
1958
+ "grad_norm": 11.8125,
1959
+ "learning_rate": 8.49e-05,
1960
+ "loss": 0.7362,
1961
+ "step": 2360
1962
+ },
1963
+ {
1964
+ "epoch": 2.9514321295143215,
1965
+ "grad_norm": 12.0625,
1966
+ "learning_rate": 8.47888888888889e-05,
1967
+ "loss": 0.8096,
1968
+ "step": 2370
1969
+ },
1970
+ {
1971
+ "epoch": 2.963885429638854,
1972
+ "grad_norm": 10.4375,
1973
+ "learning_rate": 8.467777777777777e-05,
1974
+ "loss": 0.7728,
1975
+ "step": 2380
1976
+ },
1977
+ {
1978
+ "epoch": 2.976338729763387,
1979
+ "grad_norm": 11.875,
1980
+ "learning_rate": 8.456666666666667e-05,
1981
+ "loss": 0.8224,
1982
+ "step": 2390
1983
+ },
1984
+ {
1985
+ "epoch": 2.9887920298879203,
1986
+ "grad_norm": 8.375,
1987
+ "learning_rate": 8.445555555555556e-05,
1988
+ "loss": 0.8418,
1989
+ "step": 2400
1990
+ },
1991
+ {
1992
+ "epoch": 2.9887920298879203,
1993
+ "eval/acc": 46.511627197265625,
1994
+ "step": 2400
1995
+ },
1996
+ {
1997
+ "epoch": 2.9887920298879203,
1998
+ "eval_loss": 1.9594019651412964,
1999
+ "eval_runtime": 0.2169,
2000
+ "eval_samples_per_second": 198.209,
2001
+ "eval_steps_per_second": 4.61,
2002
+ "step": 2400
2003
+ },
2004
+ {
2005
+ "epoch": 3.0012453300124533,
2006
+ "grad_norm": 6.78125,
2007
+ "learning_rate": 8.434444444444445e-05,
2008
+ "loss": 0.7483,
2009
+ "step": 2410
2010
+ },
2011
+ {
2012
+ "epoch": 3.0136986301369864,
2013
+ "grad_norm": 8.75,
2014
+ "learning_rate": 8.423333333333334e-05,
2015
+ "loss": 0.7396,
2016
+ "step": 2420
2017
+ },
2018
+ {
2019
+ "epoch": 3.0261519302615194,
2020
+ "grad_norm": 15.6875,
2021
+ "learning_rate": 8.412222222222222e-05,
2022
+ "loss": 0.7436,
2023
+ "step": 2430
2024
+ },
2025
+ {
2026
+ "epoch": 3.0386052303860525,
2027
+ "grad_norm": 8.5625,
2028
+ "learning_rate": 8.401111111111112e-05,
2029
+ "loss": 0.6092,
2030
+ "step": 2440
2031
+ },
2032
+ {
2033
+ "epoch": 3.0510585305105855,
2034
+ "grad_norm": 11.0,
2035
+ "learning_rate": 8.39e-05,
2036
+ "loss": 0.7142,
2037
+ "step": 2450
2038
+ },
2039
+ {
2040
+ "epoch": 3.063511830635118,
2041
+ "grad_norm": 11.8125,
2042
+ "learning_rate": 8.378888888888889e-05,
2043
+ "loss": 0.692,
2044
+ "step": 2460
2045
+ },
2046
+ {
2047
+ "epoch": 3.075965130759651,
2048
+ "grad_norm": 10.8125,
2049
+ "learning_rate": 8.367777777777778e-05,
2050
+ "loss": 0.672,
2051
+ "step": 2470
2052
+ },
2053
+ {
2054
+ "epoch": 3.088418430884184,
2055
+ "grad_norm": 9.0625,
2056
+ "learning_rate": 8.356666666666667e-05,
2057
+ "loss": 0.6947,
2058
+ "step": 2480
2059
+ },
2060
+ {
2061
+ "epoch": 3.1008717310087173,
2062
+ "grad_norm": 9.0625,
2063
+ "learning_rate": 8.345555555555556e-05,
2064
+ "loss": 0.7188,
2065
+ "step": 2490
2066
+ },
2067
+ {
2068
+ "epoch": 3.1133250311332503,
2069
+ "grad_norm": 7.125,
2070
+ "learning_rate": 8.334444444444444e-05,
2071
+ "loss": 0.6621,
2072
+ "step": 2500
2073
+ },
2074
+ {
2075
+ "epoch": 3.1133250311332503,
2076
+ "eval/acc": 41.86046600341797,
2077
+ "step": 2500
2078
+ },
2079
+ {
2080
+ "epoch": 3.1133250311332503,
2081
+ "eval_loss": 2.2108497619628906,
2082
+ "eval_runtime": 2.0608,
2083
+ "eval_samples_per_second": 20.866,
2084
+ "eval_steps_per_second": 0.485,
2085
+ "step": 2500
2086
+ },
2087
+ {
2088
+ "epoch": 3.1257783312577834,
2089
+ "grad_norm": 9.6875,
2090
+ "learning_rate": 8.323333333333334e-05,
2091
+ "loss": 0.7583,
2092
+ "step": 2510
2093
+ },
2094
+ {
2095
+ "epoch": 3.1382316313823164,
2096
+ "grad_norm": 8.4375,
2097
+ "learning_rate": 8.312222222222223e-05,
2098
+ "loss": 0.6909,
2099
+ "step": 2520
2100
+ },
2101
+ {
2102
+ "epoch": 3.1506849315068495,
2103
+ "grad_norm": 10.5625,
2104
+ "learning_rate": 8.301111111111111e-05,
2105
+ "loss": 0.7588,
2106
+ "step": 2530
2107
+ },
2108
+ {
2109
+ "epoch": 3.1631382316313825,
2110
+ "grad_norm": 14.1875,
2111
+ "learning_rate": 8.29e-05,
2112
+ "loss": 0.6369,
2113
+ "step": 2540
2114
+ },
2115
+ {
2116
+ "epoch": 3.175591531755915,
2117
+ "grad_norm": 8.3125,
2118
+ "learning_rate": 8.27888888888889e-05,
2119
+ "loss": 0.689,
2120
+ "step": 2550
2121
+ },
2122
+ {
2123
+ "epoch": 3.188044831880448,
2124
+ "grad_norm": 6.6875,
2125
+ "learning_rate": 8.267777777777778e-05,
2126
+ "loss": 0.6762,
2127
+ "step": 2560
2128
+ },
2129
+ {
2130
+ "epoch": 3.2004981320049812,
2131
+ "grad_norm": 12.3125,
2132
+ "learning_rate": 8.256666666666666e-05,
2133
+ "loss": 0.7338,
2134
+ "step": 2570
2135
+ },
2136
+ {
2137
+ "epoch": 3.2129514321295143,
2138
+ "grad_norm": 7.53125,
2139
+ "learning_rate": 8.245555555555556e-05,
2140
+ "loss": 0.6732,
2141
+ "step": 2580
2142
+ },
2143
+ {
2144
+ "epoch": 3.2254047322540473,
2145
+ "grad_norm": 11.1875,
2146
+ "learning_rate": 8.234444444444445e-05,
2147
+ "loss": 0.7448,
2148
+ "step": 2590
2149
+ },
2150
+ {
2151
+ "epoch": 3.2378580323785804,
2152
+ "grad_norm": 10.25,
2153
+ "learning_rate": 8.223333333333334e-05,
2154
+ "loss": 0.6227,
2155
+ "step": 2600
2156
+ },
2157
+ {
2158
+ "epoch": 3.2378580323785804,
2159
+ "eval/acc": 41.86046600341797,
2160
+ "step": 2600
2161
+ },
2162
+ {
2163
+ "epoch": 3.2378580323785804,
2164
+ "eval_loss": 2.2032454013824463,
2165
+ "eval_runtime": 0.2227,
2166
+ "eval_samples_per_second": 193.082,
2167
+ "eval_steps_per_second": 4.49,
2168
+ "step": 2600
2169
+ },
2170
+ {
2171
+ "epoch": 3.2503113325031134,
2172
+ "grad_norm": 9.6875,
2173
+ "learning_rate": 8.212222222222223e-05,
2174
+ "loss": 0.6085,
2175
+ "step": 2610
2176
+ },
2177
+ {
2178
+ "epoch": 3.2627646326276465,
2179
+ "grad_norm": 7.03125,
2180
+ "learning_rate": 8.201111111111111e-05,
2181
+ "loss": 0.6685,
2182
+ "step": 2620
2183
+ },
2184
+ {
2185
+ "epoch": 3.275217932752179,
2186
+ "grad_norm": 9.5625,
2187
+ "learning_rate": 8.19e-05,
2188
+ "loss": 0.6985,
2189
+ "step": 2630
2190
+ },
2191
+ {
2192
+ "epoch": 3.287671232876712,
2193
+ "grad_norm": 12.625,
2194
+ "learning_rate": 8.17888888888889e-05,
2195
+ "loss": 0.6639,
2196
+ "step": 2640
2197
+ },
2198
+ {
2199
+ "epoch": 3.300124533001245,
2200
+ "grad_norm": 8.6875,
2201
+ "learning_rate": 8.167777777777778e-05,
2202
+ "loss": 0.6424,
2203
+ "step": 2650
2204
+ },
2205
+ {
2206
+ "epoch": 3.3125778331257782,
2207
+ "grad_norm": 9.25,
2208
+ "learning_rate": 8.156666666666667e-05,
2209
+ "loss": 0.6259,
2210
+ "step": 2660
2211
+ },
2212
+ {
2213
+ "epoch": 3.3250311332503113,
2214
+ "grad_norm": 9.5,
2215
+ "learning_rate": 8.145555555555556e-05,
2216
+ "loss": 0.6859,
2217
+ "step": 2670
2218
+ },
2219
+ {
2220
+ "epoch": 3.3374844333748444,
2221
+ "grad_norm": 11.8125,
2222
+ "learning_rate": 8.134444444444445e-05,
2223
+ "loss": 0.7418,
2224
+ "step": 2680
2225
+ },
2226
+ {
2227
+ "epoch": 3.3499377334993774,
2228
+ "grad_norm": 9.875,
2229
+ "learning_rate": 8.123333333333333e-05,
2230
+ "loss": 0.7075,
2231
+ "step": 2690
2232
+ },
2233
+ {
2234
+ "epoch": 3.3623910336239105,
2235
+ "grad_norm": 8.375,
2236
+ "learning_rate": 8.112222222222222e-05,
2237
+ "loss": 0.6807,
2238
+ "step": 2700
2239
+ },
2240
+ {
2241
+ "epoch": 3.3623910336239105,
2242
+ "eval/acc": 41.86046600341797,
2243
+ "step": 2700
2244
+ },
2245
+ {
2246
+ "epoch": 3.3623910336239105,
2247
+ "eval_loss": 2.1954193115234375,
2248
+ "eval_runtime": 0.2168,
2249
+ "eval_samples_per_second": 198.341,
2250
+ "eval_steps_per_second": 4.613,
2251
+ "step": 2700
2252
+ },
2253
+ {
2254
+ "epoch": 3.3748443337484435,
2255
+ "grad_norm": 8.375,
2256
+ "learning_rate": 8.101111111111112e-05,
2257
+ "loss": 0.6836,
2258
+ "step": 2710
2259
+ },
2260
+ {
2261
+ "epoch": 3.3872976338729766,
2262
+ "grad_norm": 7.9375,
2263
+ "learning_rate": 8.090000000000001e-05,
2264
+ "loss": 0.664,
2265
+ "step": 2720
2266
+ },
2267
+ {
2268
+ "epoch": 3.399750933997509,
2269
+ "grad_norm": 12.25,
2270
+ "learning_rate": 8.078888888888889e-05,
2271
+ "loss": 0.7048,
2272
+ "step": 2730
2273
+ },
2274
+ {
2275
+ "epoch": 3.412204234122042,
2276
+ "grad_norm": 7.90625,
2277
+ "learning_rate": 8.067777777777778e-05,
2278
+ "loss": 0.6156,
2279
+ "step": 2740
2280
+ },
2281
+ {
2282
+ "epoch": 3.4246575342465753,
2283
+ "grad_norm": 10.8125,
2284
+ "learning_rate": 8.056666666666667e-05,
2285
+ "loss": 0.6308,
2286
+ "step": 2750
2287
+ },
2288
+ {
2289
+ "epoch": 3.4371108343711083,
2290
+ "grad_norm": 10.5,
2291
+ "learning_rate": 8.045555555555557e-05,
2292
+ "loss": 0.7397,
2293
+ "step": 2760
2294
+ },
2295
+ {
2296
+ "epoch": 3.4495641344956414,
2297
+ "grad_norm": 10.1875,
2298
+ "learning_rate": 8.034444444444444e-05,
2299
+ "loss": 0.5898,
2300
+ "step": 2770
2301
+ },
2302
+ {
2303
+ "epoch": 3.4620174346201744,
2304
+ "grad_norm": 10.375,
2305
+ "learning_rate": 8.023333333333334e-05,
2306
+ "loss": 0.663,
2307
+ "step": 2780
2308
+ },
2309
+ {
2310
+ "epoch": 3.4744707347447075,
2311
+ "grad_norm": 9.3125,
2312
+ "learning_rate": 8.012222222222222e-05,
2313
+ "loss": 0.7272,
2314
+ "step": 2790
2315
+ },
2316
+ {
2317
+ "epoch": 3.4869240348692405,
2318
+ "grad_norm": 9.375,
2319
+ "learning_rate": 8.001111111111112e-05,
2320
+ "loss": 0.6591,
2321
+ "step": 2800
2322
+ },
2323
+ {
2324
+ "epoch": 3.4869240348692405,
2325
+ "eval/acc": 37.20930099487305,
2326
+ "step": 2800
2327
+ },
2328
+ {
2329
+ "epoch": 3.4869240348692405,
2330
+ "eval_loss": 2.223583698272705,
2331
+ "eval_runtime": 0.2188,
2332
+ "eval_samples_per_second": 196.518,
2333
+ "eval_steps_per_second": 4.57,
2334
+ "step": 2800
2335
+ },
2336
+ {
2337
+ "epoch": 3.499377334993773,
2338
+ "grad_norm": 10.4375,
2339
+ "learning_rate": 7.99e-05,
2340
+ "loss": 0.6579,
2341
+ "step": 2810
2342
+ },
2343
+ {
2344
+ "epoch": 3.511830635118306,
2345
+ "grad_norm": 8.0625,
2346
+ "learning_rate": 7.978888888888889e-05,
2347
+ "loss": 0.5828,
2348
+ "step": 2820
2349
+ },
2350
+ {
2351
+ "epoch": 3.5242839352428392,
2352
+ "grad_norm": 10.8125,
2353
+ "learning_rate": 7.967777777777779e-05,
2354
+ "loss": 0.6624,
2355
+ "step": 2830
2356
+ },
2357
+ {
2358
+ "epoch": 3.5367372353673723,
2359
+ "grad_norm": 8.6875,
2360
+ "learning_rate": 7.956666666666667e-05,
2361
+ "loss": 0.6394,
2362
+ "step": 2840
2363
+ },
2364
+ {
2365
+ "epoch": 3.5491905354919053,
2366
+ "grad_norm": 8.4375,
2367
+ "learning_rate": 7.945555555555556e-05,
2368
+ "loss": 0.6937,
2369
+ "step": 2850
2370
+ },
2371
+ {
2372
+ "epoch": 3.5616438356164384,
2373
+ "grad_norm": 9.1875,
2374
+ "learning_rate": 7.934444444444444e-05,
2375
+ "loss": 0.7377,
2376
+ "step": 2860
2377
+ },
2378
+ {
2379
+ "epoch": 3.5740971357409714,
2380
+ "grad_norm": 8.9375,
2381
+ "learning_rate": 7.923333333333334e-05,
2382
+ "loss": 0.6374,
2383
+ "step": 2870
2384
+ },
2385
+ {
2386
+ "epoch": 3.5865504358655045,
2387
+ "grad_norm": 11.25,
2388
+ "learning_rate": 7.912222222222224e-05,
2389
+ "loss": 0.6606,
2390
+ "step": 2880
2391
+ },
2392
+ {
2393
+ "epoch": 3.5990037359900375,
2394
+ "grad_norm": 11.0,
2395
+ "learning_rate": 7.901111111111111e-05,
2396
+ "loss": 0.692,
2397
+ "step": 2890
2398
+ },
2399
+ {
2400
+ "epoch": 3.6114570361145706,
2401
+ "grad_norm": 8.375,
2402
+ "learning_rate": 7.890000000000001e-05,
2403
+ "loss": 0.6431,
2404
+ "step": 2900
2405
+ },
2406
+ {
2407
+ "epoch": 3.6114570361145706,
2408
+ "eval/acc": 41.86046600341797,
2409
+ "step": 2900
2410
+ },
2411
+ {
2412
+ "epoch": 3.6114570361145706,
2413
+ "eval_loss": 2.137084722518921,
2414
+ "eval_runtime": 0.2198,
2415
+ "eval_samples_per_second": 195.618,
2416
+ "eval_steps_per_second": 4.549,
2417
+ "step": 2900
2418
+ },
2419
+ {
2420
+ "epoch": 3.6239103362391036,
2421
+ "grad_norm": 32.25,
2422
+ "learning_rate": 7.878888888888889e-05,
2423
+ "loss": 0.6968,
2424
+ "step": 2910
2425
+ },
2426
+ {
2427
+ "epoch": 3.6363636363636362,
2428
+ "grad_norm": 9.25,
2429
+ "learning_rate": 7.867777777777779e-05,
2430
+ "loss": 0.7186,
2431
+ "step": 2920
2432
+ },
2433
+ {
2434
+ "epoch": 3.6488169364881693,
2435
+ "grad_norm": 10.0625,
2436
+ "learning_rate": 7.856666666666666e-05,
2437
+ "loss": 0.5565,
2438
+ "step": 2930
2439
+ },
2440
+ {
2441
+ "epoch": 3.6612702366127023,
2442
+ "grad_norm": 8.4375,
2443
+ "learning_rate": 7.845555555555556e-05,
2444
+ "loss": 0.6036,
2445
+ "step": 2940
2446
+ },
2447
+ {
2448
+ "epoch": 3.6737235367372354,
2449
+ "grad_norm": 10.6875,
2450
+ "learning_rate": 7.834444444444446e-05,
2451
+ "loss": 0.6465,
2452
+ "step": 2950
2453
+ },
2454
+ {
2455
+ "epoch": 3.6861768368617684,
2456
+ "grad_norm": 11.1875,
2457
+ "learning_rate": 7.823333333333334e-05,
2458
+ "loss": 0.803,
2459
+ "step": 2960
2460
+ },
2461
+ {
2462
+ "epoch": 3.6986301369863015,
2463
+ "grad_norm": 9.9375,
2464
+ "learning_rate": 7.812222222222223e-05,
2465
+ "loss": 0.5897,
2466
+ "step": 2970
2467
+ },
2468
+ {
2469
+ "epoch": 3.711083437110834,
2470
+ "grad_norm": 9.1875,
2471
+ "learning_rate": 7.801111111111111e-05,
2472
+ "loss": 0.6373,
2473
+ "step": 2980
2474
+ },
2475
+ {
2476
+ "epoch": 3.723536737235367,
2477
+ "grad_norm": 7.9375,
2478
+ "learning_rate": 7.790000000000001e-05,
2479
+ "loss": 0.696,
2480
+ "step": 2990
2481
+ },
2482
+ {
2483
+ "epoch": 3.7359900373599,
2484
+ "grad_norm": 8.0625,
2485
+ "learning_rate": 7.77888888888889e-05,
2486
+ "loss": 0.612,
2487
+ "step": 3000
2488
+ },
2489
+ {
2490
+ "epoch": 3.7359900373599,
2491
+ "eval/acc": 37.20930099487305,
2492
+ "step": 3000
2493
+ },
2494
+ {
2495
+ "epoch": 3.7359900373599,
2496
+ "eval_loss": 2.2000465393066406,
2497
+ "eval_runtime": 0.2179,
2498
+ "eval_samples_per_second": 197.339,
2499
+ "eval_steps_per_second": 4.589,
2500
+ "step": 3000
2501
+ },
2502
+ {
2503
+ "epoch": 3.7484433374844333,
2504
+ "grad_norm": 8.8125,
2505
+ "learning_rate": 7.767777777777778e-05,
2506
+ "loss": 0.687,
2507
+ "step": 3010
2508
+ },
2509
+ {
2510
+ "epoch": 3.7608966376089663,
2511
+ "grad_norm": 11.875,
2512
+ "learning_rate": 7.756666666666666e-05,
2513
+ "loss": 0.706,
2514
+ "step": 3020
2515
+ },
2516
+ {
2517
+ "epoch": 3.7733499377334994,
2518
+ "grad_norm": 9.9375,
2519
+ "learning_rate": 7.745555555555556e-05,
2520
+ "loss": 0.7025,
2521
+ "step": 3030
2522
+ },
2523
+ {
2524
+ "epoch": 3.7858032378580324,
2525
+ "grad_norm": 8.25,
2526
+ "learning_rate": 7.734444444444445e-05,
2527
+ "loss": 0.6412,
2528
+ "step": 3040
2529
+ },
2530
+ {
2531
+ "epoch": 3.7982565379825655,
2532
+ "grad_norm": 9.5625,
2533
+ "learning_rate": 7.723333333333333e-05,
2534
+ "loss": 0.6873,
2535
+ "step": 3050
2536
+ },
2537
+ {
2538
+ "epoch": 3.8107098381070985,
2539
+ "grad_norm": 9.9375,
2540
+ "learning_rate": 7.712222222222223e-05,
2541
+ "loss": 0.6956,
2542
+ "step": 3060
2543
+ },
2544
+ {
2545
+ "epoch": 3.8231631382316316,
2546
+ "grad_norm": 7.46875,
2547
+ "learning_rate": 7.701111111111111e-05,
2548
+ "loss": 0.6178,
2549
+ "step": 3070
2550
+ },
2551
+ {
2552
+ "epoch": 3.8356164383561646,
2553
+ "grad_norm": 8.75,
2554
+ "learning_rate": 7.69e-05,
2555
+ "loss": 0.6322,
2556
+ "step": 3080
2557
+ },
2558
+ {
2559
+ "epoch": 3.848069738480697,
2560
+ "grad_norm": 7.375,
2561
+ "learning_rate": 7.678888888888888e-05,
2562
+ "loss": 0.6457,
2563
+ "step": 3090
2564
+ },
2565
+ {
2566
+ "epoch": 3.8605230386052303,
2567
+ "grad_norm": 7.84375,
2568
+ "learning_rate": 7.667777777777778e-05,
2569
+ "loss": 0.6358,
2570
+ "step": 3100
2571
+ },
2572
+ {
2573
+ "epoch": 3.8605230386052303,
2574
+ "eval/acc": 46.511627197265625,
2575
+ "step": 3100
2576
+ },
2577
+ {
2578
+ "epoch": 3.8605230386052303,
2579
+ "eval_loss": 2.1861517429351807,
2580
+ "eval_runtime": 0.2234,
2581
+ "eval_samples_per_second": 192.489,
2582
+ "eval_steps_per_second": 4.476,
2583
+ "step": 3100
2584
+ },
2585
+ {
2586
+ "epoch": 3.8729763387297633,
2587
+ "grad_norm": 8.875,
2588
+ "learning_rate": 7.656666666666668e-05,
2589
+ "loss": 0.6032,
2590
+ "step": 3110
2591
+ },
2592
+ {
2593
+ "epoch": 3.8854296388542964,
2594
+ "grad_norm": 9.75,
2595
+ "learning_rate": 7.645555555555556e-05,
2596
+ "loss": 0.6119,
2597
+ "step": 3120
2598
+ },
2599
+ {
2600
+ "epoch": 3.8978829389788294,
2601
+ "grad_norm": 9.8125,
2602
+ "learning_rate": 7.634444444444445e-05,
2603
+ "loss": 0.6792,
2604
+ "step": 3130
2605
+ },
2606
+ {
2607
+ "epoch": 3.9103362391033625,
2608
+ "grad_norm": 9.0,
2609
+ "learning_rate": 7.623333333333333e-05,
2610
+ "loss": 0.6191,
2611
+ "step": 3140
2612
+ },
2613
+ {
2614
+ "epoch": 3.9227895392278955,
2615
+ "grad_norm": 8.8125,
2616
+ "learning_rate": 7.612222222222223e-05,
2617
+ "loss": 0.6312,
2618
+ "step": 3150
2619
+ },
2620
+ {
2621
+ "epoch": 3.935242839352428,
2622
+ "grad_norm": 8.0,
2623
+ "learning_rate": 7.601111111111112e-05,
2624
+ "loss": 0.6148,
2625
+ "step": 3160
2626
+ },
2627
+ {
2628
+ "epoch": 3.947696139476961,
2629
+ "grad_norm": 8.5,
2630
+ "learning_rate": 7.59e-05,
2631
+ "loss": 0.6006,
2632
+ "step": 3170
2633
+ },
2634
+ {
2635
+ "epoch": 3.9601494396014942,
2636
+ "grad_norm": 8.375,
2637
+ "learning_rate": 7.578888888888889e-05,
2638
+ "loss": 0.6129,
2639
+ "step": 3180
2640
+ },
2641
+ {
2642
+ "epoch": 3.9726027397260273,
2643
+ "grad_norm": 9.3125,
2644
+ "learning_rate": 7.567777777777778e-05,
2645
+ "loss": 0.6801,
2646
+ "step": 3190
2647
+ },
2648
+ {
2649
+ "epoch": 3.9850560398505603,
2650
+ "grad_norm": 10.0,
2651
+ "learning_rate": 7.556666666666667e-05,
2652
+ "loss": 0.7283,
2653
+ "step": 3200
2654
+ },
2655
+ {
2656
+ "epoch": 3.9850560398505603,
2657
+ "eval/acc": 44.1860466003418,
2658
+ "step": 3200
2659
+ },
2660
+ {
2661
+ "epoch": 3.9850560398505603,
2662
+ "eval_loss": 2.154561758041382,
2663
+ "eval_runtime": 0.2194,
2664
+ "eval_samples_per_second": 196.004,
2665
+ "eval_steps_per_second": 4.558,
2666
+ "step": 3200
2667
+ },
2668
+ {
2669
+ "epoch": 3.9975093399750934,
2670
+ "grad_norm": 6.78125,
2671
+ "learning_rate": 7.545555555555555e-05,
2672
+ "loss": 0.6551,
2673
+ "step": 3210
2674
+ },
2675
+ {
2676
+ "epoch": 4.009962640099626,
2677
+ "grad_norm": 7.15625,
2678
+ "learning_rate": 7.534444444444445e-05,
2679
+ "loss": 0.6154,
2680
+ "step": 3220
2681
+ },
2682
+ {
2683
+ "epoch": 4.0224159402241595,
2684
+ "grad_norm": 8.3125,
2685
+ "learning_rate": 7.523333333333334e-05,
2686
+ "loss": 0.5325,
2687
+ "step": 3230
2688
+ },
2689
+ {
2690
+ "epoch": 4.0348692403486925,
2691
+ "grad_norm": 12.125,
2692
+ "learning_rate": 7.512222222222222e-05,
2693
+ "loss": 0.6045,
2694
+ "step": 3240
2695
+ },
2696
+ {
2697
+ "epoch": 4.047322540473226,
2698
+ "grad_norm": 8.875,
2699
+ "learning_rate": 7.50111111111111e-05,
2700
+ "loss": 0.6101,
2701
+ "step": 3250
2702
+ },
2703
+ {
2704
+ "epoch": 4.059775840597759,
2705
+ "grad_norm": 8.9375,
2706
+ "learning_rate": 7.49e-05,
2707
+ "loss": 0.6041,
2708
+ "step": 3260
2709
+ },
2710
+ {
2711
+ "epoch": 4.072229140722292,
2712
+ "grad_norm": 13.9375,
2713
+ "learning_rate": 7.47888888888889e-05,
2714
+ "loss": 0.5396,
2715
+ "step": 3270
2716
+ },
2717
+ {
2718
+ "epoch": 4.084682440846825,
2719
+ "grad_norm": 10.4375,
2720
+ "learning_rate": 7.467777777777777e-05,
2721
+ "loss": 0.5914,
2722
+ "step": 3280
2723
+ },
2724
+ {
2725
+ "epoch": 4.097135740971358,
2726
+ "grad_norm": 8.4375,
2727
+ "learning_rate": 7.456666666666667e-05,
2728
+ "loss": 0.5281,
2729
+ "step": 3290
2730
+ },
2731
+ {
2732
+ "epoch": 4.109589041095891,
2733
+ "grad_norm": 10.0,
2734
+ "learning_rate": 7.445555555555556e-05,
2735
+ "loss": 0.6078,
2736
+ "step": 3300
2737
+ },
2738
+ {
2739
+ "epoch": 4.109589041095891,
2740
+ "eval/acc": 32.55813980102539,
2741
+ "step": 3300
2742
+ },
2743
+ {
2744
+ "epoch": 4.109589041095891,
2745
+ "eval_loss": 2.71657133102417,
2746
+ "eval_runtime": 2.577,
2747
+ "eval_samples_per_second": 16.686,
2748
+ "eval_steps_per_second": 0.388,
2749
+ "step": 3300
2750
+ },
2751
+ {
2752
+ "epoch": 4.122042341220423,
2753
+ "grad_norm": 9.5,
2754
+ "learning_rate": 7.434444444444446e-05,
2755
+ "loss": 0.6107,
2756
+ "step": 3310
2757
+ },
2758
+ {
2759
+ "epoch": 4.134495641344956,
2760
+ "grad_norm": 8.375,
2761
+ "learning_rate": 7.423333333333333e-05,
2762
+ "loss": 0.5635,
2763
+ "step": 3320
2764
+ },
2765
+ {
2766
+ "epoch": 4.146948941469489,
2767
+ "grad_norm": 9.4375,
2768
+ "learning_rate": 7.412222222222222e-05,
2769
+ "loss": 0.6277,
2770
+ "step": 3330
2771
+ },
2772
+ {
2773
+ "epoch": 4.159402241594022,
2774
+ "grad_norm": 10.0,
2775
+ "learning_rate": 7.401111111111112e-05,
2776
+ "loss": 0.71,
2777
+ "step": 3340
2778
+ },
2779
+ {
2780
+ "epoch": 4.171855541718555,
2781
+ "grad_norm": 9.625,
2782
+ "learning_rate": 7.390000000000001e-05,
2783
+ "loss": 0.5073,
2784
+ "step": 3350
2785
+ },
2786
+ {
2787
+ "epoch": 4.184308841843088,
2788
+ "grad_norm": 11.125,
2789
+ "learning_rate": 7.378888888888889e-05,
2790
+ "loss": 0.613,
2791
+ "step": 3360
2792
+ },
2793
+ {
2794
+ "epoch": 4.196762141967621,
2795
+ "grad_norm": 9.0625,
2796
+ "learning_rate": 7.367777777777778e-05,
2797
+ "loss": 0.6214,
2798
+ "step": 3370
2799
+ },
2800
+ {
2801
+ "epoch": 4.209215442092154,
2802
+ "grad_norm": 7.75,
2803
+ "learning_rate": 7.356666666666667e-05,
2804
+ "loss": 0.5957,
2805
+ "step": 3380
2806
+ },
2807
+ {
2808
+ "epoch": 4.221668742216687,
2809
+ "grad_norm": 7.71875,
2810
+ "learning_rate": 7.345555555555556e-05,
2811
+ "loss": 0.5365,
2812
+ "step": 3390
2813
+ },
2814
+ {
2815
+ "epoch": 4.2341220423412205,
2816
+ "grad_norm": 7.5,
2817
+ "learning_rate": 7.334444444444444e-05,
2818
+ "loss": 0.4691,
2819
+ "step": 3400
2820
+ },
2821
+ {
2822
+ "epoch": 4.2341220423412205,
2823
+ "eval/acc": 37.20930099487305,
2824
+ "step": 3400
2825
+ },
2826
+ {
2827
+ "epoch": 4.2341220423412205,
2828
+ "eval_loss": 2.6280057430267334,
2829
+ "eval_runtime": 0.2305,
2830
+ "eval_samples_per_second": 186.512,
2831
+ "eval_steps_per_second": 4.337,
2832
+ "step": 3400
2833
+ },
2834
+ {
2835
+ "epoch": 4.2465753424657535,
2836
+ "grad_norm": 8.5,
2837
+ "learning_rate": 7.323333333333333e-05,
2838
+ "loss": 0.6429,
2839
+ "step": 3410
2840
+ },
2841
+ {
2842
+ "epoch": 4.259028642590287,
2843
+ "grad_norm": 8.625,
2844
+ "learning_rate": 7.312222222222223e-05,
2845
+ "loss": 0.5898,
2846
+ "step": 3420
2847
+ },
2848
+ {
2849
+ "epoch": 4.27148194271482,
2850
+ "grad_norm": 8.375,
2851
+ "learning_rate": 7.301111111111113e-05,
2852
+ "loss": 0.6207,
2853
+ "step": 3430
2854
+ },
2855
+ {
2856
+ "epoch": 4.283935242839353,
2857
+ "grad_norm": 7.9375,
2858
+ "learning_rate": 7.29e-05,
2859
+ "loss": 0.5395,
2860
+ "step": 3440
2861
+ },
2862
+ {
2863
+ "epoch": 4.296388542963886,
2864
+ "grad_norm": 7.25,
2865
+ "learning_rate": 7.27888888888889e-05,
2866
+ "loss": 0.5836,
2867
+ "step": 3450
2868
+ },
2869
+ {
2870
+ "epoch": 4.308841843088419,
2871
+ "grad_norm": 8.5,
2872
+ "learning_rate": 7.267777777777778e-05,
2873
+ "loss": 0.661,
2874
+ "step": 3460
2875
+ },
2876
+ {
2877
+ "epoch": 4.321295143212952,
2878
+ "grad_norm": 11.1875,
2879
+ "learning_rate": 7.256666666666668e-05,
2880
+ "loss": 0.5491,
2881
+ "step": 3470
2882
+ },
2883
+ {
2884
+ "epoch": 4.333748443337484,
2885
+ "grad_norm": 9.25,
2886
+ "learning_rate": 7.245555555555555e-05,
2887
+ "loss": 0.5489,
2888
+ "step": 3480
2889
+ },
2890
+ {
2891
+ "epoch": 4.346201743462017,
2892
+ "grad_norm": 10.0,
2893
+ "learning_rate": 7.234444444444445e-05,
2894
+ "loss": 0.6114,
2895
+ "step": 3490
2896
+ },
2897
+ {
2898
+ "epoch": 4.35865504358655,
2899
+ "grad_norm": 10.9375,
2900
+ "learning_rate": 7.223333333333335e-05,
2901
+ "loss": 0.5994,
2902
+ "step": 3500
2903
+ },
2904
+ {
2905
+ "epoch": 4.35865504358655,
2906
+ "eval/acc": 37.20930099487305,
2907
+ "step": 3500
2908
+ },
2909
+ {
2910
+ "epoch": 4.35865504358655,
2911
+ "eval_loss": 2.7129628658294678,
2912
+ "eval_runtime": 0.2205,
2913
+ "eval_samples_per_second": 194.97,
2914
+ "eval_steps_per_second": 4.534,
2915
+ "step": 3500
2916
+ },
2917
+ {
2918
+ "epoch": 4.371108343711083,
2919
+ "grad_norm": 11.0625,
2920
+ "learning_rate": 7.212222222222223e-05,
2921
+ "loss": 0.6538,
2922
+ "step": 3510
2923
+ },
2924
+ {
2925
+ "epoch": 4.383561643835616,
2926
+ "grad_norm": 8.875,
2927
+ "learning_rate": 7.201111111111111e-05,
2928
+ "loss": 0.6445,
2929
+ "step": 3520
2930
+ },
2931
+ {
2932
+ "epoch": 4.396014943960149,
2933
+ "grad_norm": 8.0625,
2934
+ "learning_rate": 7.19e-05,
2935
+ "loss": 0.5865,
2936
+ "step": 3530
2937
+ },
2938
+ {
2939
+ "epoch": 4.408468244084682,
2940
+ "grad_norm": 7.84375,
2941
+ "learning_rate": 7.17888888888889e-05,
2942
+ "loss": 0.5756,
2943
+ "step": 3540
2944
+ },
2945
+ {
2946
+ "epoch": 4.420921544209215,
2947
+ "grad_norm": 9.6875,
2948
+ "learning_rate": 7.167777777777778e-05,
2949
+ "loss": 0.599,
2950
+ "step": 3550
2951
+ },
2952
+ {
2953
+ "epoch": 4.433374844333748,
2954
+ "grad_norm": 7.71875,
2955
+ "learning_rate": 7.156666666666667e-05,
2956
+ "loss": 0.5953,
2957
+ "step": 3560
2958
+ },
2959
+ {
2960
+ "epoch": 4.4458281444582815,
2961
+ "grad_norm": 8.375,
2962
+ "learning_rate": 7.145555555555557e-05,
2963
+ "loss": 0.5322,
2964
+ "step": 3570
2965
+ },
2966
+ {
2967
+ "epoch": 4.4582814445828145,
2968
+ "grad_norm": 12.0625,
2969
+ "learning_rate": 7.134444444444445e-05,
2970
+ "loss": 0.6048,
2971
+ "step": 3580
2972
+ },
2973
+ {
2974
+ "epoch": 4.4707347447073476,
2975
+ "grad_norm": 91.5,
2976
+ "learning_rate": 7.123333333333333e-05,
2977
+ "loss": 0.601,
2978
+ "step": 3590
2979
+ },
2980
+ {
2981
+ "epoch": 4.483188044831881,
2982
+ "grad_norm": 9.5625,
2983
+ "learning_rate": 7.112222222222222e-05,
2984
+ "loss": 0.565,
2985
+ "step": 3600
2986
+ },
2987
+ {
2988
+ "epoch": 4.483188044831881,
2989
+ "eval/acc": 37.20930099487305,
2990
+ "step": 3600
2991
+ },
2992
+ {
2993
+ "epoch": 4.483188044831881,
2994
+ "eval_loss": 2.709101438522339,
2995
+ "eval_runtime": 5.5657,
2996
+ "eval_samples_per_second": 7.726,
2997
+ "eval_steps_per_second": 0.18,
2998
+ "step": 3600
2999
+ },
3000
+ {
3001
+ "epoch": 4.495641344956414,
3002
+ "grad_norm": 9.875,
3003
+ "learning_rate": 7.101111111111112e-05,
3004
+ "loss": 0.6184,
3005
+ "step": 3610
3006
+ },
3007
+ {
3008
+ "epoch": 4.508094645080947,
3009
+ "grad_norm": 11.3125,
3010
+ "learning_rate": 7.09e-05,
3011
+ "loss": 0.5582,
3012
+ "step": 3620
3013
+ },
3014
+ {
3015
+ "epoch": 4.52054794520548,
3016
+ "grad_norm": 10.3125,
3017
+ "learning_rate": 7.078888888888889e-05,
3018
+ "loss": 0.5685,
3019
+ "step": 3630
3020
+ },
3021
+ {
3022
+ "epoch": 4.533001245330013,
3023
+ "grad_norm": 9.8125,
3024
+ "learning_rate": 7.067777777777777e-05,
3025
+ "loss": 0.6142,
3026
+ "step": 3640
3027
+ },
3028
+ {
3029
+ "epoch": 4.545454545454545,
3030
+ "grad_norm": 36.0,
3031
+ "learning_rate": 7.056666666666667e-05,
3032
+ "loss": 0.5638,
3033
+ "step": 3650
3034
+ },
3035
+ {
3036
+ "epoch": 4.557907845579079,
3037
+ "grad_norm": 9.75,
3038
+ "learning_rate": 7.045555555555557e-05,
3039
+ "loss": 0.5066,
3040
+ "step": 3660
3041
+ },
3042
+ {
3043
+ "epoch": 4.570361145703611,
3044
+ "grad_norm": 12.3125,
3045
+ "learning_rate": 7.034444444444445e-05,
3046
+ "loss": 0.5949,
3047
+ "step": 3670
3048
+ },
3049
+ {
3050
+ "epoch": 4.582814445828144,
3051
+ "grad_norm": 9.1875,
3052
+ "learning_rate": 7.023333333333334e-05,
3053
+ "loss": 0.5913,
3054
+ "step": 3680
3055
+ },
3056
+ {
3057
+ "epoch": 4.595267745952677,
3058
+ "grad_norm": 9.25,
3059
+ "learning_rate": 7.012222222222222e-05,
3060
+ "loss": 0.6607,
3061
+ "step": 3690
3062
+ },
3063
+ {
3064
+ "epoch": 4.60772104607721,
3065
+ "grad_norm": 10.25,
3066
+ "learning_rate": 7.001111111111112e-05,
3067
+ "loss": 0.5912,
3068
+ "step": 3700
3069
+ },
3070
+ {
3071
+ "epoch": 4.60772104607721,
3072
+ "eval/acc": 34.88372039794922,
3073
+ "step": 3700
3074
+ },
3075
+ {
3076
+ "epoch": 4.60772104607721,
3077
+ "eval_loss": 2.7723538875579834,
3078
+ "eval_runtime": 0.2246,
3079
+ "eval_samples_per_second": 191.465,
3080
+ "eval_steps_per_second": 4.453,
3081
+ "step": 3700
3082
+ },
3083
+ {
3084
+ "epoch": 4.620174346201743,
3085
+ "grad_norm": 10.875,
3086
+ "learning_rate": 6.99e-05,
3087
+ "loss": 0.6132,
3088
+ "step": 3710
3089
+ },
3090
+ {
3091
+ "epoch": 4.632627646326276,
3092
+ "grad_norm": 11.125,
3093
+ "learning_rate": 6.978888888888889e-05,
3094
+ "loss": 0.5569,
3095
+ "step": 3720
3096
+ },
3097
+ {
3098
+ "epoch": 4.645080946450809,
3099
+ "grad_norm": 13.75,
3100
+ "learning_rate": 6.967777777777779e-05,
3101
+ "loss": 0.5752,
3102
+ "step": 3730
3103
+ },
3104
+ {
3105
+ "epoch": 4.657534246575342,
3106
+ "grad_norm": 10.6875,
3107
+ "learning_rate": 6.956666666666667e-05,
3108
+ "loss": 0.584,
3109
+ "step": 3740
3110
+ },
3111
+ {
3112
+ "epoch": 4.6699875466998755,
3113
+ "grad_norm": 8.5,
3114
+ "learning_rate": 6.945555555555556e-05,
3115
+ "loss": 0.5779,
3116
+ "step": 3750
3117
+ },
3118
+ {
3119
+ "epoch": 4.6824408468244085,
3120
+ "grad_norm": 8.375,
3121
+ "learning_rate": 6.934444444444444e-05,
3122
+ "loss": 0.5435,
3123
+ "step": 3760
3124
+ },
3125
+ {
3126
+ "epoch": 4.694894146948942,
3127
+ "grad_norm": 10.1875,
3128
+ "learning_rate": 6.923333333333334e-05,
3129
+ "loss": 0.6124,
3130
+ "step": 3770
3131
+ },
3132
+ {
3133
+ "epoch": 4.707347447073475,
3134
+ "grad_norm": 8.5,
3135
+ "learning_rate": 6.912222222222222e-05,
3136
+ "loss": 0.6087,
3137
+ "step": 3780
3138
+ },
3139
+ {
3140
+ "epoch": 4.719800747198008,
3141
+ "grad_norm": 10.625,
3142
+ "learning_rate": 6.901111111111111e-05,
3143
+ "loss": 0.5831,
3144
+ "step": 3790
3145
+ },
3146
+ {
3147
+ "epoch": 4.732254047322541,
3148
+ "grad_norm": 7.84375,
3149
+ "learning_rate": 6.89e-05,
3150
+ "loss": 0.5711,
3151
+ "step": 3800
3152
+ },
3153
+ {
3154
+ "epoch": 4.732254047322541,
3155
+ "eval/acc": 34.88372039794922,
3156
+ "step": 3800
3157
+ },
3158
+ {
3159
+ "epoch": 4.732254047322541,
3160
+ "eval_loss": 2.7747089862823486,
3161
+ "eval_runtime": 0.3882,
3162
+ "eval_samples_per_second": 110.773,
3163
+ "eval_steps_per_second": 2.576,
3164
+ "step": 3800
3165
+ },
3166
+ {
3167
+ "epoch": 4.744707347447074,
3168
+ "grad_norm": 8.25,
3169
+ "learning_rate": 6.878888888888889e-05,
3170
+ "loss": 0.617,
3171
+ "step": 3810
3172
+ },
3173
+ {
3174
+ "epoch": 4.757160647571607,
3175
+ "grad_norm": 10.125,
3176
+ "learning_rate": 6.867777777777779e-05,
3177
+ "loss": 0.5471,
3178
+ "step": 3820
3179
+ },
3180
+ {
3181
+ "epoch": 4.76961394769614,
3182
+ "grad_norm": 10.8125,
3183
+ "learning_rate": 6.856666666666666e-05,
3184
+ "loss": 0.6168,
3185
+ "step": 3830
3186
+ },
3187
+ {
3188
+ "epoch": 4.782067247820672,
3189
+ "grad_norm": 10.5,
3190
+ "learning_rate": 6.845555555555556e-05,
3191
+ "loss": 0.6136,
3192
+ "step": 3840
3193
+ },
3194
+ {
3195
+ "epoch": 4.794520547945205,
3196
+ "grad_norm": 12.1875,
3197
+ "learning_rate": 6.834444444444444e-05,
3198
+ "loss": 0.6066,
3199
+ "step": 3850
3200
+ },
3201
+ {
3202
+ "epoch": 4.806973848069738,
3203
+ "grad_norm": 11.125,
3204
+ "learning_rate": 6.823333333333334e-05,
3205
+ "loss": 0.6457,
3206
+ "step": 3860
3207
+ },
3208
+ {
3209
+ "epoch": 4.819427148194271,
3210
+ "grad_norm": 12.375,
3211
+ "learning_rate": 6.812222222222221e-05,
3212
+ "loss": 0.6109,
3213
+ "step": 3870
3214
+ },
3215
+ {
3216
+ "epoch": 4.831880448318804,
3217
+ "grad_norm": 10.0625,
3218
+ "learning_rate": 6.801111111111111e-05,
3219
+ "loss": 0.6063,
3220
+ "step": 3880
3221
+ },
3222
+ {
3223
+ "epoch": 4.844333748443337,
3224
+ "grad_norm": 8.1875,
3225
+ "learning_rate": 6.790000000000001e-05,
3226
+ "loss": 0.5681,
3227
+ "step": 3890
3228
+ },
3229
+ {
3230
+ "epoch": 4.85678704856787,
3231
+ "grad_norm": 24.125,
3232
+ "learning_rate": 6.77888888888889e-05,
3233
+ "loss": 0.5041,
3234
+ "step": 3900
3235
+ },
3236
+ {
3237
+ "epoch": 4.85678704856787,
3238
+ "eval/acc": 32.55813980102539,
3239
+ "step": 3900
3240
+ },
3241
+ {
3242
+ "epoch": 4.85678704856787,
3243
+ "eval_loss": 2.7735087871551514,
3244
+ "eval_runtime": 0.2177,
3245
+ "eval_samples_per_second": 197.481,
3246
+ "eval_steps_per_second": 4.593,
3247
+ "step": 3900
3248
+ },
3249
+ {
3250
+ "epoch": 4.869240348692403,
3251
+ "grad_norm": 10.875,
3252
+ "learning_rate": 6.767777777777778e-05,
3253
+ "loss": 0.5809,
3254
+ "step": 3910
3255
+ },
3256
+ {
3257
+ "epoch": 4.8816936488169365,
3258
+ "grad_norm": 9.0625,
3259
+ "learning_rate": 6.756666666666666e-05,
3260
+ "loss": 0.5673,
3261
+ "step": 3920
3262
+ },
3263
+ {
3264
+ "epoch": 4.8941469489414695,
3265
+ "grad_norm": 8.5625,
3266
+ "learning_rate": 6.745555555555556e-05,
3267
+ "loss": 0.625,
3268
+ "step": 3930
3269
+ },
3270
+ {
3271
+ "epoch": 4.906600249066003,
3272
+ "grad_norm": 7.625,
3273
+ "learning_rate": 6.734444444444445e-05,
3274
+ "loss": 0.541,
3275
+ "step": 3940
3276
+ },
3277
+ {
3278
+ "epoch": 4.919053549190536,
3279
+ "grad_norm": 8.75,
3280
+ "learning_rate": 6.723333333333333e-05,
3281
+ "loss": 0.5742,
3282
+ "step": 3950
3283
+ },
3284
+ {
3285
+ "epoch": 4.931506849315069,
3286
+ "grad_norm": 12.1875,
3287
+ "learning_rate": 6.712222222222223e-05,
3288
+ "loss": 0.5545,
3289
+ "step": 3960
3290
+ },
3291
+ {
3292
+ "epoch": 4.943960149439602,
3293
+ "grad_norm": 13.1875,
3294
+ "learning_rate": 6.701111111111112e-05,
3295
+ "loss": 0.6346,
3296
+ "step": 3970
3297
+ },
3298
+ {
3299
+ "epoch": 4.956413449564135,
3300
+ "grad_norm": 11.6875,
3301
+ "learning_rate": 6.690000000000001e-05,
3302
+ "loss": 0.5706,
3303
+ "step": 3980
3304
+ },
3305
+ {
3306
+ "epoch": 4.968866749688668,
3307
+ "grad_norm": 8.375,
3308
+ "learning_rate": 6.678888888888888e-05,
3309
+ "loss": 0.5641,
3310
+ "step": 3990
3311
+ },
3312
+ {
3313
+ "epoch": 4.981320049813201,
3314
+ "grad_norm": 9.9375,
3315
+ "learning_rate": 6.667777777777778e-05,
3316
+ "loss": 0.5201,
3317
+ "step": 4000
3318
+ },
3319
+ {
3320
+ "epoch": 4.981320049813201,
3321
+ "eval/acc": 23.255813598632812,
3322
+ "step": 4000
3323
+ },
3324
+ {
3325
+ "epoch": 4.981320049813201,
3326
+ "eval_loss": 2.779590606689453,
3327
+ "eval_runtime": 0.2285,
3328
+ "eval_samples_per_second": 188.191,
3329
+ "eval_steps_per_second": 4.377,
3330
+ "step": 4000
3331
+ },
3332
+ {
3333
+ "epoch": 4.993773349937733,
3334
+ "grad_norm": 11.1875,
3335
+ "learning_rate": 6.656666666666667e-05,
3336
+ "loss": 0.6364,
3337
+ "step": 4010
3338
+ },
3339
+ {
3340
+ "epoch": 5.006226650062266,
3341
+ "grad_norm": 8.3125,
3342
+ "learning_rate": 6.645555555555557e-05,
3343
+ "loss": 0.5594,
3344
+ "step": 4020
3345
+ },
3346
+ {
3347
+ "epoch": 5.018679950186799,
3348
+ "grad_norm": 8.25,
3349
+ "learning_rate": 6.634444444444444e-05,
3350
+ "loss": 0.4926,
3351
+ "step": 4030
3352
+ },
3353
+ {
3354
+ "epoch": 5.031133250311332,
3355
+ "grad_norm": 9.25,
3356
+ "learning_rate": 6.623333333333334e-05,
3357
+ "loss": 0.552,
3358
+ "step": 4040
3359
+ },
3360
+ {
3361
+ "epoch": 5.043586550435865,
3362
+ "grad_norm": 10.3125,
3363
+ "learning_rate": 6.612222222222223e-05,
3364
+ "loss": 0.5068,
3365
+ "step": 4050
3366
+ },
3367
+ {
3368
+ "epoch": 5.056039850560398,
3369
+ "grad_norm": 9.4375,
3370
+ "learning_rate": 6.601111111111112e-05,
3371
+ "loss": 0.526,
3372
+ "step": 4060
3373
+ },
3374
+ {
3375
+ "epoch": 5.068493150684931,
3376
+ "grad_norm": 10.875,
3377
+ "learning_rate": 6.59e-05,
3378
+ "loss": 0.4818,
3379
+ "step": 4070
3380
+ },
3381
+ {
3382
+ "epoch": 5.080946450809464,
3383
+ "grad_norm": 7.75,
3384
+ "learning_rate": 6.578888888888889e-05,
3385
+ "loss": 0.491,
3386
+ "step": 4080
3387
+ },
3388
+ {
3389
+ "epoch": 5.093399750933997,
3390
+ "grad_norm": 8.6875,
3391
+ "learning_rate": 6.567777777777779e-05,
3392
+ "loss": 0.4351,
3393
+ "step": 4090
3394
+ },
3395
+ {
3396
+ "epoch": 5.1058530510585305,
3397
+ "grad_norm": 9.3125,
3398
+ "learning_rate": 6.556666666666667e-05,
3399
+ "loss": 0.5479,
3400
+ "step": 4100
3401
+ },
3402
+ {
3403
+ "epoch": 5.1058530510585305,
3404
+ "eval/acc": 34.88372039794922,
3405
+ "step": 4100
3406
+ },
3407
+ {
3408
+ "epoch": 5.1058530510585305,
3409
+ "eval_loss": 3.02447772026062,
3410
+ "eval_runtime": 6.1633,
3411
+ "eval_samples_per_second": 6.977,
3412
+ "eval_steps_per_second": 0.162,
3413
+ "step": 4100
3414
+ },
3415
+ {
3416
+ "epoch": 5.1183063511830635,
3417
+ "grad_norm": 30.0,
3418
+ "learning_rate": 6.545555555555555e-05,
3419
+ "loss": 0.556,
3420
+ "step": 4110
3421
+ },
3422
+ {
3423
+ "epoch": 5.130759651307597,
3424
+ "grad_norm": 9.5,
3425
+ "learning_rate": 6.534444444444445e-05,
3426
+ "loss": 0.5635,
3427
+ "step": 4120
3428
+ },
3429
+ {
3430
+ "epoch": 5.14321295143213,
3431
+ "grad_norm": 8.9375,
3432
+ "learning_rate": 6.523333333333334e-05,
3433
+ "loss": 0.4989,
3434
+ "step": 4130
3435
+ },
3436
+ {
3437
+ "epoch": 5.155666251556663,
3438
+ "grad_norm": 10.25,
3439
+ "learning_rate": 6.512222222222222e-05,
3440
+ "loss": 0.5918,
3441
+ "step": 4140
3442
+ },
3443
+ {
3444
+ "epoch": 5.168119551681196,
3445
+ "grad_norm": 22.875,
3446
+ "learning_rate": 6.501111111111111e-05,
3447
+ "loss": 0.521,
3448
+ "step": 4150
3449
+ },
3450
+ {
3451
+ "epoch": 5.180572851805729,
3452
+ "grad_norm": 10.75,
3453
+ "learning_rate": 6.49e-05,
3454
+ "loss": 0.452,
3455
+ "step": 4160
3456
+ },
3457
+ {
3458
+ "epoch": 5.193026151930262,
3459
+ "grad_norm": 8.0,
3460
+ "learning_rate": 6.478888888888889e-05,
3461
+ "loss": 0.4939,
3462
+ "step": 4170
3463
+ },
3464
+ {
3465
+ "epoch": 5.205479452054795,
3466
+ "grad_norm": 12.4375,
3467
+ "learning_rate": 6.467777777777779e-05,
3468
+ "loss": 0.5435,
3469
+ "step": 4180
3470
+ },
3471
+ {
3472
+ "epoch": 5.217932752179328,
3473
+ "grad_norm": 12.75,
3474
+ "learning_rate": 6.456666666666667e-05,
3475
+ "loss": 0.5814,
3476
+ "step": 4190
3477
+ },
3478
+ {
3479
+ "epoch": 5.23038605230386,
3480
+ "grad_norm": 8.75,
3481
+ "learning_rate": 6.445555555555556e-05,
3482
+ "loss": 0.5099,
3483
+ "step": 4200
3484
+ },
3485
+ {
3486
+ "epoch": 5.23038605230386,
3487
+ "eval/acc": 34.88372039794922,
3488
+ "step": 4200
3489
+ },
3490
+ {
3491
+ "epoch": 5.23038605230386,
3492
+ "eval_loss": 3.014472723007202,
3493
+ "eval_runtime": 0.8915,
3494
+ "eval_samples_per_second": 48.231,
3495
+ "eval_steps_per_second": 1.122,
3496
+ "step": 4200
3497
+ },
3498
+ {
3499
+ "epoch": 5.242839352428393,
3500
+ "grad_norm": 7.65625,
3501
+ "learning_rate": 6.434444444444446e-05,
3502
+ "loss": 0.5053,
3503
+ "step": 4210
3504
+ },
3505
+ {
3506
+ "epoch": 5.255292652552926,
3507
+ "grad_norm": 11.625,
3508
+ "learning_rate": 6.423333333333334e-05,
3509
+ "loss": 0.4819,
3510
+ "step": 4220
3511
+ },
3512
+ {
3513
+ "epoch": 5.267745952677459,
3514
+ "grad_norm": 7.3125,
3515
+ "learning_rate": 6.412222222222223e-05,
3516
+ "loss": 0.5072,
3517
+ "step": 4230
3518
+ },
3519
+ {
3520
+ "epoch": 5.280199252801992,
3521
+ "grad_norm": 10.5625,
3522
+ "learning_rate": 6.401111111111111e-05,
3523
+ "loss": 0.5305,
3524
+ "step": 4240
3525
+ },
3526
+ {
3527
+ "epoch": 5.292652552926525,
3528
+ "grad_norm": 10.25,
3529
+ "learning_rate": 6.390000000000001e-05,
3530
+ "loss": 0.4878,
3531
+ "step": 4250
3532
+ },
3533
+ {
3534
+ "epoch": 5.305105853051058,
3535
+ "grad_norm": 10.9375,
3536
+ "learning_rate": 6.378888888888889e-05,
3537
+ "loss": 0.4998,
3538
+ "step": 4260
3539
+ },
3540
+ {
3541
+ "epoch": 5.3175591531755915,
3542
+ "grad_norm": 10.0,
3543
+ "learning_rate": 6.367777777777778e-05,
3544
+ "loss": 0.5084,
3545
+ "step": 4270
3546
+ },
3547
+ {
3548
+ "epoch": 5.3300124533001245,
3549
+ "grad_norm": 12.375,
3550
+ "learning_rate": 6.356666666666668e-05,
3551
+ "loss": 0.5479,
3552
+ "step": 4280
3553
+ },
3554
+ {
3555
+ "epoch": 5.342465753424658,
3556
+ "grad_norm": 10.125,
3557
+ "learning_rate": 6.345555555555556e-05,
3558
+ "loss": 0.5765,
3559
+ "step": 4290
3560
+ },
3561
+ {
3562
+ "epoch": 5.354919053549191,
3563
+ "grad_norm": 8.375,
3564
+ "learning_rate": 6.334444444444445e-05,
3565
+ "loss": 0.5085,
3566
+ "step": 4300
3567
+ },
3568
+ {
3569
+ "epoch": 5.354919053549191,
3570
+ "eval/acc": 34.88372039794922,
3571
+ "step": 4300
3572
+ },
3573
+ {
3574
+ "epoch": 5.354919053549191,
3575
+ "eval_loss": 2.994861602783203,
3576
+ "eval_runtime": 0.2348,
3577
+ "eval_samples_per_second": 183.171,
3578
+ "eval_steps_per_second": 4.26,
3579
+ "step": 4300
3580
+ },
3581
+ {
3582
+ "epoch": 5.367372353673724,
3583
+ "grad_norm": 11.4375,
3584
+ "learning_rate": 6.323333333333333e-05,
3585
+ "loss": 0.5267,
3586
+ "step": 4310
3587
+ },
3588
+ {
3589
+ "epoch": 5.379825653798257,
3590
+ "grad_norm": 8.0625,
3591
+ "learning_rate": 6.312222222222223e-05,
3592
+ "loss": 0.5324,
3593
+ "step": 4320
3594
+ },
3595
+ {
3596
+ "epoch": 5.39227895392279,
3597
+ "grad_norm": 6.6875,
3598
+ "learning_rate": 6.301111111111111e-05,
3599
+ "loss": 0.4314,
3600
+ "step": 4330
3601
+ },
3602
+ {
3603
+ "epoch": 5.404732254047323,
3604
+ "grad_norm": 9.25,
3605
+ "learning_rate": 6.29e-05,
3606
+ "loss": 0.4408,
3607
+ "step": 4340
3608
+ },
3609
+ {
3610
+ "epoch": 5.417185554171856,
3611
+ "grad_norm": 9.875,
3612
+ "learning_rate": 6.27888888888889e-05,
3613
+ "loss": 0.4803,
3614
+ "step": 4350
3615
+ },
3616
+ {
3617
+ "epoch": 5.429638854296389,
3618
+ "grad_norm": 12.6875,
3619
+ "learning_rate": 6.267777777777778e-05,
3620
+ "loss": 0.4837,
3621
+ "step": 4360
3622
+ },
3623
+ {
3624
+ "epoch": 5.442092154420921,
3625
+ "grad_norm": 12.0625,
3626
+ "learning_rate": 6.256666666666668e-05,
3627
+ "loss": 0.5353,
3628
+ "step": 4370
3629
+ },
3630
+ {
3631
+ "epoch": 5.454545454545454,
3632
+ "grad_norm": 10.375,
3633
+ "learning_rate": 6.245555555555555e-05,
3634
+ "loss": 0.5362,
3635
+ "step": 4380
3636
+ },
3637
+ {
3638
+ "epoch": 5.466998754669987,
3639
+ "grad_norm": 8.375,
3640
+ "learning_rate": 6.234444444444445e-05,
3641
+ "loss": 0.5287,
3642
+ "step": 4390
3643
+ },
3644
+ {
3645
+ "epoch": 5.47945205479452,
3646
+ "grad_norm": 19.875,
3647
+ "learning_rate": 6.223333333333333e-05,
3648
+ "loss": 0.5403,
3649
+ "step": 4400
3650
+ },
3651
+ {
3652
+ "epoch": 5.47945205479452,
3653
+ "eval/acc": 34.88372039794922,
3654
+ "step": 4400
3655
+ },
3656
+ {
3657
+ "epoch": 5.47945205479452,
3658
+ "eval_loss": 2.997584342956543,
3659
+ "eval_runtime": 0.2299,
3660
+ "eval_samples_per_second": 187.065,
3661
+ "eval_steps_per_second": 4.35,
3662
+ "step": 4400
3663
+ },
3664
+ {
3665
+ "epoch": 5.491905354919053,
3666
+ "grad_norm": 7.8125,
3667
+ "learning_rate": 6.212222222222223e-05,
3668
+ "loss": 0.4824,
3669
+ "step": 4410
3670
+ },
3671
+ {
3672
+ "epoch": 5.504358655043586,
3673
+ "grad_norm": 8.375,
3674
+ "learning_rate": 6.20111111111111e-05,
3675
+ "loss": 0.5007,
3676
+ "step": 4420
3677
+ },
3678
+ {
3679
+ "epoch": 5.516811955168119,
3680
+ "grad_norm": 9.9375,
3681
+ "learning_rate": 6.19e-05,
3682
+ "loss": 0.5269,
3683
+ "step": 4430
3684
+ },
3685
+ {
3686
+ "epoch": 5.5292652552926524,
3687
+ "grad_norm": 8.0,
3688
+ "learning_rate": 6.17888888888889e-05,
3689
+ "loss": 0.5173,
3690
+ "step": 4440
3691
+ },
3692
+ {
3693
+ "epoch": 5.5417185554171855,
3694
+ "grad_norm": 11.1875,
3695
+ "learning_rate": 6.167777777777778e-05,
3696
+ "loss": 0.5705,
3697
+ "step": 4450
3698
+ },
3699
+ {
3700
+ "epoch": 5.5541718555417185,
3701
+ "grad_norm": 9.75,
3702
+ "learning_rate": 6.156666666666667e-05,
3703
+ "loss": 0.5471,
3704
+ "step": 4460
3705
+ },
3706
+ {
3707
+ "epoch": 5.566625155666252,
3708
+ "grad_norm": 7.65625,
3709
+ "learning_rate": 6.145555555555555e-05,
3710
+ "loss": 0.5193,
3711
+ "step": 4470
3712
+ },
3713
+ {
3714
+ "epoch": 5.579078455790785,
3715
+ "grad_norm": 9.0,
3716
+ "learning_rate": 6.134444444444445e-05,
3717
+ "loss": 0.5472,
3718
+ "step": 4480
3719
+ },
3720
+ {
3721
+ "epoch": 5.591531755915318,
3722
+ "grad_norm": 9.25,
3723
+ "learning_rate": 6.123333333333334e-05,
3724
+ "loss": 0.5345,
3725
+ "step": 4490
3726
+ },
3727
+ {
3728
+ "epoch": 5.603985056039851,
3729
+ "grad_norm": 10.5,
3730
+ "learning_rate": 6.112222222222222e-05,
3731
+ "loss": 0.5522,
3732
+ "step": 4500
3733
+ },
3734
+ {
3735
+ "epoch": 5.603985056039851,
3736
+ "eval/acc": 34.88372039794922,
3737
+ "step": 4500
3738
+ },
3739
+ {
3740
+ "epoch": 5.603985056039851,
3741
+ "eval_loss": 3.013643980026245,
3742
+ "eval_runtime": 0.2357,
3743
+ "eval_samples_per_second": 182.402,
3744
+ "eval_steps_per_second": 4.242,
3745
+ "step": 4500
3746
+ },
3747
+ {
3748
+ "epoch": 5.616438356164384,
3749
+ "grad_norm": 7.84375,
3750
+ "learning_rate": 6.101111111111112e-05,
3751
+ "loss": 0.571,
3752
+ "step": 4510
3753
+ },
3754
+ {
3755
+ "epoch": 5.628891656288917,
3756
+ "grad_norm": 8.375,
3757
+ "learning_rate": 6.09e-05,
3758
+ "loss": 0.4792,
3759
+ "step": 4520
3760
+ },
3761
+ {
3762
+ "epoch": 5.64134495641345,
3763
+ "grad_norm": 7.84375,
3764
+ "learning_rate": 6.0788888888888895e-05,
3765
+ "loss": 0.5154,
3766
+ "step": 4530
3767
+ },
3768
+ {
3769
+ "epoch": 5.653798256537982,
3770
+ "grad_norm": 8.75,
3771
+ "learning_rate": 6.067777777777778e-05,
3772
+ "loss": 0.5508,
3773
+ "step": 4540
3774
+ },
3775
+ {
3776
+ "epoch": 5.666251556662516,
3777
+ "grad_norm": 11.8125,
3778
+ "learning_rate": 6.056666666666667e-05,
3779
+ "loss": 0.5183,
3780
+ "step": 4550
3781
+ },
3782
+ {
3783
+ "epoch": 5.678704856787048,
3784
+ "grad_norm": 10.75,
3785
+ "learning_rate": 6.0455555555555555e-05,
3786
+ "loss": 0.596,
3787
+ "step": 4560
3788
+ },
3789
+ {
3790
+ "epoch": 5.691158156911581,
3791
+ "grad_norm": 9.8125,
3792
+ "learning_rate": 6.034444444444445e-05,
3793
+ "loss": 0.4746,
3794
+ "step": 4570
3795
+ },
3796
+ {
3797
+ "epoch": 5.703611457036114,
3798
+ "grad_norm": 8.9375,
3799
+ "learning_rate": 6.023333333333334e-05,
3800
+ "loss": 0.4419,
3801
+ "step": 4580
3802
+ },
3803
+ {
3804
+ "epoch": 5.716064757160647,
3805
+ "grad_norm": 9.5,
3806
+ "learning_rate": 6.012222222222222e-05,
3807
+ "loss": 0.4842,
3808
+ "step": 4590
3809
+ },
3810
+ {
3811
+ "epoch": 5.72851805728518,
3812
+ "grad_norm": 9.1875,
3813
+ "learning_rate": 6.001111111111112e-05,
3814
+ "loss": 0.5282,
3815
+ "step": 4600
3816
+ },
3817
+ {
3818
+ "epoch": 5.72851805728518,
3819
+ "eval/acc": 34.88372039794922,
3820
+ "step": 4600
3821
+ },
3822
+ {
3823
+ "epoch": 5.72851805728518,
3824
+ "eval_loss": 3.12896466255188,
3825
+ "eval_runtime": 0.224,
3826
+ "eval_samples_per_second": 191.949,
3827
+ "eval_steps_per_second": 4.464,
3828
+ "step": 4600
3829
+ },
3830
+ {
3831
+ "epoch": 5.740971357409713,
3832
+ "grad_norm": 10.0625,
3833
+ "learning_rate": 5.99e-05,
3834
+ "loss": 0.5623,
3835
+ "step": 4610
3836
+ },
3837
+ {
3838
+ "epoch": 5.7534246575342465,
3839
+ "grad_norm": 10.5625,
3840
+ "learning_rate": 5.97888888888889e-05,
3841
+ "loss": 0.5931,
3842
+ "step": 4620
3843
+ },
3844
+ {
3845
+ "epoch": 5.7658779576587795,
3846
+ "grad_norm": 11.5,
3847
+ "learning_rate": 5.9677777777777775e-05,
3848
+ "loss": 0.5964,
3849
+ "step": 4630
3850
+ },
3851
+ {
3852
+ "epoch": 5.778331257783313,
3853
+ "grad_norm": 9.5625,
3854
+ "learning_rate": 5.9566666666666673e-05,
3855
+ "loss": 0.5159,
3856
+ "step": 4640
3857
+ },
3858
+ {
3859
+ "epoch": 5.790784557907846,
3860
+ "grad_norm": 9.75,
3861
+ "learning_rate": 5.945555555555555e-05,
3862
+ "loss": 0.4576,
3863
+ "step": 4650
3864
+ },
3865
+ {
3866
+ "epoch": 5.803237858032379,
3867
+ "grad_norm": 11.9375,
3868
+ "learning_rate": 5.934444444444445e-05,
3869
+ "loss": 0.4963,
3870
+ "step": 4660
3871
+ },
3872
+ {
3873
+ "epoch": 5.815691158156912,
3874
+ "grad_norm": 8.8125,
3875
+ "learning_rate": 5.923333333333334e-05,
3876
+ "loss": 0.4869,
3877
+ "step": 4670
3878
+ },
3879
+ {
3880
+ "epoch": 5.828144458281445,
3881
+ "grad_norm": 9.3125,
3882
+ "learning_rate": 5.9122222222222226e-05,
3883
+ "loss": 0.4578,
3884
+ "step": 4680
3885
+ },
3886
+ {
3887
+ "epoch": 5.840597758405978,
3888
+ "grad_norm": 8.5,
3889
+ "learning_rate": 5.901111111111112e-05,
3890
+ "loss": 0.491,
3891
+ "step": 4690
3892
+ },
3893
+ {
3894
+ "epoch": 5.853051058530511,
3895
+ "grad_norm": 8.0,
3896
+ "learning_rate": 5.89e-05,
3897
+ "loss": 0.5223,
3898
+ "step": 4700
3899
+ },
3900
+ {
3901
+ "epoch": 5.853051058530511,
3902
+ "eval/acc": 34.88372039794922,
3903
+ "step": 4700
3904
+ },
3905
+ {
3906
+ "epoch": 5.853051058530511,
3907
+ "eval_loss": 3.08896541595459,
3908
+ "eval_runtime": 0.2255,
3909
+ "eval_samples_per_second": 190.673,
3910
+ "eval_steps_per_second": 4.434,
3911
+ "step": 4700
3912
+ },
3913
+ {
3914
+ "epoch": 5.865504358655044,
3915
+ "grad_norm": 9.125,
3916
+ "learning_rate": 5.878888888888889e-05,
3917
+ "loss": 0.4916,
3918
+ "step": 4710
3919
+ },
3920
+ {
3921
+ "epoch": 5.877957658779577,
3922
+ "grad_norm": 11.625,
3923
+ "learning_rate": 5.867777777777778e-05,
3924
+ "loss": 0.5485,
3925
+ "step": 4720
3926
+ },
3927
+ {
3928
+ "epoch": 5.890410958904109,
3929
+ "grad_norm": 8.9375,
3930
+ "learning_rate": 5.856666666666667e-05,
3931
+ "loss": 0.4782,
3932
+ "step": 4730
3933
+ },
3934
+ {
3935
+ "epoch": 5.902864259028642,
3936
+ "grad_norm": 8.5,
3937
+ "learning_rate": 5.845555555555556e-05,
3938
+ "loss": 0.4756,
3939
+ "step": 4740
3940
+ },
3941
+ {
3942
+ "epoch": 5.915317559153175,
3943
+ "grad_norm": 8.5,
3944
+ "learning_rate": 5.8344444444444446e-05,
3945
+ "loss": 0.4848,
3946
+ "step": 4750
3947
+ },
3948
+ {
3949
+ "epoch": 5.927770859277708,
3950
+ "grad_norm": 10.3125,
3951
+ "learning_rate": 5.823333333333334e-05,
3952
+ "loss": 0.5772,
3953
+ "step": 4760
3954
+ },
3955
+ {
3956
+ "epoch": 5.940224159402241,
3957
+ "grad_norm": 10.3125,
3958
+ "learning_rate": 5.812222222222222e-05,
3959
+ "loss": 0.4972,
3960
+ "step": 4770
3961
+ },
3962
+ {
3963
+ "epoch": 5.952677459526774,
3964
+ "grad_norm": 13.125,
3965
+ "learning_rate": 5.801111111111111e-05,
3966
+ "loss": 0.5185,
3967
+ "step": 4780
3968
+ },
3969
+ {
3970
+ "epoch": 5.9651307596513075,
3971
+ "grad_norm": 10.25,
3972
+ "learning_rate": 5.79e-05,
3973
+ "loss": 0.4956,
3974
+ "step": 4790
3975
+ },
3976
+ {
3977
+ "epoch": 5.9775840597758405,
3978
+ "grad_norm": 7.59375,
3979
+ "learning_rate": 5.778888888888889e-05,
3980
+ "loss": 0.5193,
3981
+ "step": 4800
3982
+ },
3983
+ {
3984
+ "epoch": 5.9775840597758405,
3985
+ "eval/acc": 34.88372039794922,
3986
+ "step": 4800
3987
+ },
3988
+ {
3989
+ "epoch": 5.9775840597758405,
3990
+ "eval_loss": 3.0848581790924072,
3991
+ "eval_runtime": 0.2247,
3992
+ "eval_samples_per_second": 191.396,
3993
+ "eval_steps_per_second": 4.451,
3994
+ "step": 4800
3995
+ },
3996
+ {
3997
+ "epoch": 5.990037359900374,
3998
+ "grad_norm": 10.1875,
3999
+ "learning_rate": 5.7677777777777774e-05,
4000
+ "loss": 0.4816,
4001
+ "step": 4810
4002
+ },
4003
+ {
4004
+ "epoch": 6.002490660024907,
4005
+ "grad_norm": 8.5,
4006
+ "learning_rate": 5.7566666666666666e-05,
4007
+ "loss": 0.512,
4008
+ "step": 4820
4009
+ },
4010
+ {
4011
+ "epoch": 6.01494396014944,
4012
+ "grad_norm": 10.75,
4013
+ "learning_rate": 5.7455555555555564e-05,
4014
+ "loss": 0.536,
4015
+ "step": 4830
4016
+ },
4017
+ {
4018
+ "epoch": 6.027397260273973,
4019
+ "grad_norm": 10.8125,
4020
+ "learning_rate": 5.734444444444445e-05,
4021
+ "loss": 0.4573,
4022
+ "step": 4840
4023
+ },
4024
+ {
4025
+ "epoch": 6.039850560398506,
4026
+ "grad_norm": 10.1875,
4027
+ "learning_rate": 5.723333333333334e-05,
4028
+ "loss": 0.5556,
4029
+ "step": 4850
4030
+ },
4031
+ {
4032
+ "epoch": 6.052303860523039,
4033
+ "grad_norm": 13.6875,
4034
+ "learning_rate": 5.7122222222222225e-05,
4035
+ "loss": 0.4513,
4036
+ "step": 4860
4037
+ },
4038
+ {
4039
+ "epoch": 6.064757160647572,
4040
+ "grad_norm": 11.0,
4041
+ "learning_rate": 5.7011111111111116e-05,
4042
+ "loss": 0.4453,
4043
+ "step": 4870
4044
+ },
4045
+ {
4046
+ "epoch": 6.077210460772105,
4047
+ "grad_norm": 11.3125,
4048
+ "learning_rate": 5.69e-05,
4049
+ "loss": 0.5147,
4050
+ "step": 4880
4051
+ },
4052
+ {
4053
+ "epoch": 6.089663760896638,
4054
+ "grad_norm": 13.125,
4055
+ "learning_rate": 5.678888888888889e-05,
4056
+ "loss": 0.4948,
4057
+ "step": 4890
4058
+ },
4059
+ {
4060
+ "epoch": 6.102117061021171,
4061
+ "grad_norm": 11.4375,
4062
+ "learning_rate": 5.6677777777777784e-05,
4063
+ "loss": 0.5324,
4064
+ "step": 4900
4065
+ },
4066
+ {
4067
+ "epoch": 6.102117061021171,
4068
+ "eval/acc": 34.88372039794922,
4069
+ "step": 4900
4070
+ },
4071
+ {
4072
+ "epoch": 6.102117061021171,
4073
+ "eval_loss": 2.459988832473755,
4074
+ "eval_runtime": 6.3869,
4075
+ "eval_samples_per_second": 6.733,
4076
+ "eval_steps_per_second": 0.157,
4077
+ "step": 4900
4078
+ },
4079
+ {
4080
+ "epoch": 6.114570361145703,
4081
+ "grad_norm": 10.6875,
4082
+ "learning_rate": 5.656666666666667e-05,
4083
+ "loss": 0.4818,
4084
+ "step": 4910
4085
+ },
4086
+ {
4087
+ "epoch": 6.127023661270236,
4088
+ "grad_norm": 10.3125,
4089
+ "learning_rate": 5.645555555555556e-05,
4090
+ "loss": 0.5784,
4091
+ "step": 4920
4092
+ },
4093
+ {
4094
+ "epoch": 6.139476961394769,
4095
+ "grad_norm": 13.4375,
4096
+ "learning_rate": 5.6344444444444444e-05,
4097
+ "loss": 0.3966,
4098
+ "step": 4930
4099
+ },
4100
+ {
4101
+ "epoch": 6.151930261519302,
4102
+ "grad_norm": 10.375,
4103
+ "learning_rate": 5.6233333333333336e-05,
4104
+ "loss": 0.4861,
4105
+ "step": 4940
4106
+ },
4107
+ {
4108
+ "epoch": 6.164383561643835,
4109
+ "grad_norm": 10.375,
4110
+ "learning_rate": 5.612222222222222e-05,
4111
+ "loss": 0.4646,
4112
+ "step": 4950
4113
+ },
4114
+ {
4115
+ "epoch": 6.176836861768368,
4116
+ "grad_norm": 10.1875,
4117
+ "learning_rate": 5.601111111111111e-05,
4118
+ "loss": 0.5416,
4119
+ "step": 4960
4120
+ },
4121
+ {
4122
+ "epoch": 6.1892901618929015,
4123
+ "grad_norm": 10.0625,
4124
+ "learning_rate": 5.590000000000001e-05,
4125
+ "loss": 0.4985,
4126
+ "step": 4970
4127
+ },
4128
+ {
4129
+ "epoch": 6.2017434620174345,
4130
+ "grad_norm": 10.125,
4131
+ "learning_rate": 5.578888888888889e-05,
4132
+ "loss": 0.4865,
4133
+ "step": 4980
4134
+ },
4135
+ {
4136
+ "epoch": 6.214196762141968,
4137
+ "grad_norm": 10.625,
4138
+ "learning_rate": 5.5677777777777786e-05,
4139
+ "loss": 0.4818,
4140
+ "step": 4990
4141
+ },
4142
+ {
4143
+ "epoch": 6.226650062266501,
4144
+ "grad_norm": 10.75,
4145
+ "learning_rate": 5.5566666666666664e-05,
4146
+ "loss": 0.4909,
4147
+ "step": 5000
4148
+ },
4149
+ {
4150
+ "epoch": 6.226650062266501,
4151
+ "eval/acc": 34.88372039794922,
4152
+ "step": 5000
4153
+ },
4154
+ {
4155
+ "epoch": 6.226650062266501,
4156
+ "eval_loss": 2.4926505088806152,
4157
+ "eval_runtime": 0.2216,
4158
+ "eval_samples_per_second": 194.068,
4159
+ "eval_steps_per_second": 4.513,
4160
+ "step": 5000
4161
+ }
4162
+ ],
4163
+ "logging_steps": 10,
4164
+ "max_steps": 10000,
4165
+ "num_input_tokens_seen": 0,
4166
+ "num_train_epochs": 13,
4167
+ "save_steps": 2500,
4168
+ "stateful_callbacks": {
4169
+ "TrainerControl": {
4170
+ "args": {
4171
+ "should_epoch_stop": false,
4172
+ "should_evaluate": false,
4173
+ "should_log": false,
4174
+ "should_save": true,
4175
+ "should_training_stop": false
4176
+ },
4177
+ "attributes": {}
4178
+ }
4179
+ },
4180
+ "total_flos": 0.0,
4181
+ "train_batch_size": 16,
4182
+ "trial_name": null,
4183
+ "trial_params": null
4184
+ }
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61b297d5e64c898db22dbb0b2c7feb17b604dfbcc3bfebf55e9e7ecbf9c3794c
3
+ size 6161
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ModernBertModel"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 50281,
8
+ "classifier_activation": "gelu",
9
+ "classifier_bias": false,
10
+ "classifier_dropout": 0.0,
11
+ "classifier_pooling": "mean",
12
+ "cls_token_id": 50281,
13
+ "decoder_bias": true,
14
+ "deterministic_flash_attn": false,
15
+ "dtype": "bfloat16",
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "repad_logits_with_grad": false,
40
+ "sep_token_id": 50282,
41
+ "sparse_pred_ignore_index": -100,
42
+ "sparse_prediction": false,
43
+ "transformers_version": "4.57.1",
44
+ "vocab_size": 50368
45
+ }
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6aecd4553a1d58ca11b17030014708270f38be0c51df4dcd752bb2e3ddc8dd81
3
+ size 298041696
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d48bd01953728d4eac963af505f6d4253af613156cb9e6895810b87c6d7b7524
3
+ size 596170443
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ef8cbbdda5cb37ab7cf20f5c005619f2776e3f4face3617c4680d91d8a07ece
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0be06a52a1be8ec61cf963589563ad06d498c8986f95cc6e2cf6dd2628b95f36
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8de3a32ca6dbe3f13cbc720107ecdd6dbaaca93202f28d4cd937551ab5665d8b
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cadd49afaa5e5d92514cc4b811a3968236ab79a1466a7061413f550c41026201
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:773947218d8d7737c8670043d737e80a30fe17375af8e46749692f7803f2df3b
3
+ size 1465
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61b297d5e64c898db22dbb0b2c7feb17b604dfbcc3bfebf55e9e7ecbf9c3794c
3
+ size 6161
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ModernBertModel"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 50281,
8
+ "classifier_activation": "gelu",
9
+ "classifier_bias": false,
10
+ "classifier_dropout": 0.0,
11
+ "classifier_pooling": "mean",
12
+ "cls_token_id": 50281,
13
+ "decoder_bias": true,
14
+ "deterministic_flash_attn": false,
15
+ "dtype": "bfloat16",
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "repad_logits_with_grad": false,
40
+ "sep_token_id": 50282,
41
+ "sparse_pred_ignore_index": -100,
42
+ "sparse_prediction": false,
43
+ "transformers_version": "4.57.1",
44
+ "vocab_size": 50368
45
+ }
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa648019534a54da16e5df11fb28257398ac4ee886de2d2ef90e587b14a698f7
3
+ size 298041696
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a47b56f6d386053d17cfa1c908b16dcebcec2fa8dbf6ea679e0add277be30b3
3
+ size 596170443
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c235c10397ca3fb3b82475883c48d3bb786206feaee53c2199c913179faf1fb
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:937bfac24cd2fe886a72cb180e9d726f8629acaf1e31b2beab1f7a03381ca0ca
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee0687693332dd9f28a675c2a9f27590ae650095d80dac61354fce4437e7f9de
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffb4dab4ba8c60d5f5c48a1048c1ecc4e949aff462fd8340d7ad1a380fc12fdd
3
+ size 15429
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/runs/Nov25_13-58-36_nid005118/events.out.tfevents.1764072154.nid005118.9241.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a00311704ec9f450f3bbd2a82bb88a4f98ffb9756b557103426b1bbd5b1571c
3
+ size 254610
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d02be6d8bda4ea9c67040ed89f878acdc986bd4df3fbc60440a9d3eacca02d63
3
+ size 1465