Commit
•
2351dee
1
Parent(s):
89a42ef
Training in progress, step 2500
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- model.safetensors +1 -1
- run-0/checkpoint-1000/model.safetensors +1 -1
- run-0/checkpoint-1000/optimizer.pt +1 -1
- run-0/checkpoint-1000/scheduler.pt +1 -1
- run-0/checkpoint-1000/trainer_state.json +27 -27
- run-0/checkpoint-1000/training_args.bin +1 -1
- run-0/checkpoint-1500/model.safetensors +1 -1
- run-0/checkpoint-1500/optimizer.pt +1 -1
- run-0/checkpoint-1500/scheduler.pt +1 -1
- run-0/checkpoint-1500/trainer_state.json +35 -35
- run-0/checkpoint-1500/training_args.bin +1 -1
- run-0/checkpoint-500/model.safetensors +1 -1
- run-0/checkpoint-500/optimizer.pt +1 -1
- run-0/checkpoint-500/scheduler.pt +1 -1
- run-0/checkpoint-500/trainer_state.json +14 -14
- run-0/checkpoint-500/training_args.bin +1 -1
- run-1/checkpoint-1000/model.safetensors +1 -1
- run-1/checkpoint-1000/optimizer.pt +1 -1
- run-1/checkpoint-1000/scheduler.pt +1 -1
- run-1/checkpoint-1000/trainer_state.json +26 -26
- run-1/checkpoint-1000/training_args.bin +1 -1
- run-1/checkpoint-1500/model.safetensors +1 -1
- run-1/checkpoint-1500/optimizer.pt +1 -1
- run-1/checkpoint-1500/scheduler.pt +1 -1
- run-1/checkpoint-1500/trainer_state.json +34 -34
- run-1/checkpoint-1500/training_args.bin +1 -1
- run-1/checkpoint-2000/model.safetensors +1 -1
- run-1/checkpoint-2000/optimizer.pt +1 -1
- run-1/checkpoint-2000/scheduler.pt +1 -1
- run-1/checkpoint-2000/trainer_state.json +47 -47
- run-1/checkpoint-2000/training_args.bin +1 -1
- run-1/checkpoint-500/model.safetensors +1 -1
- run-1/checkpoint-500/optimizer.pt +1 -1
- run-1/checkpoint-500/scheduler.pt +1 -1
- run-1/checkpoint-500/trainer_state.json +13 -13
- run-1/checkpoint-500/training_args.bin +1 -1
- run-2/checkpoint-1000/model.safetensors +1 -1
- run-2/checkpoint-1000/optimizer.pt +1 -1
- run-2/checkpoint-1000/scheduler.pt +1 -1
- run-2/checkpoint-1000/trainer_state.json +26 -26
- run-2/checkpoint-1000/training_args.bin +1 -1
- run-2/checkpoint-1500/model.safetensors +1 -1
- run-2/checkpoint-1500/optimizer.pt +1 -1
- run-2/checkpoint-1500/scheduler.pt +1 -1
- run-2/checkpoint-1500/trainer_state.json +34 -34
- run-2/checkpoint-1500/training_args.bin +1 -1
- run-2/checkpoint-2000/model.safetensors +1 -1
- run-2/checkpoint-2000/optimizer.pt +1 -1
- run-2/checkpoint-2000/scheduler.pt +1 -1
- run-2/checkpoint-2000/trainer_state.json +47 -47
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:78016a20790cea0a2ba4e424321e0f9b1c92dba6f42f3058778c503ac0b8e500
|
3 |
size 268290900
|
run-0/checkpoint-1000/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fcdfe9a6c39ec2298ff882406bfc249659761f4552c8ab3392cfd68269c4ee22
|
3 |
size 268290900
|
run-0/checkpoint-1000/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 536643898
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:37e2f2b450d6975ded453dd9ba20a24ede194f9df45c36ed9c5a4048276a5e29
|
3 |
size 536643898
|
run-0/checkpoint-1000/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d435f08c91fbb25ecefa6816f8694ea265b07f7526f27bca41c28b9ad50fad06
|
3 |
size 1064
|
run-0/checkpoint-1000/trainer_state.json
CHANGED
@@ -10,57 +10,57 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
-
"eval_accuracy": 0.
|
14 |
-
"eval_loss": 0.
|
15 |
-
"eval_runtime": 14.
|
16 |
-
"eval_samples_per_second":
|
17 |
-
"eval_steps_per_second": 26.
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
-
"grad_norm": 0.
|
23 |
-
"learning_rate": 1.
|
24 |
-
"loss": 0.
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
-
"eval_accuracy": 0.
|
30 |
-
"eval_loss": 0.
|
31 |
-
"eval_runtime":
|
32 |
-
"eval_samples_per_second":
|
33 |
-
"eval_steps_per_second":
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
-
"eval_accuracy": 0.
|
39 |
-
"eval_loss": 0.
|
40 |
-
"eval_runtime":
|
41 |
-
"eval_samples_per_second":
|
42 |
-
"eval_steps_per_second":
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
-
"grad_norm": 0.
|
48 |
-
"learning_rate":
|
49 |
-
"loss": 0.
|
50 |
"step": 1000
|
51 |
}
|
52 |
],
|
53 |
"logging_steps": 500,
|
54 |
-
"max_steps":
|
55 |
"num_input_tokens_seen": 0,
|
56 |
-
"num_train_epochs":
|
57 |
"save_steps": 500,
|
58 |
-
"total_flos":
|
59 |
"train_batch_size": 48,
|
60 |
"trial_name": null,
|
61 |
"trial_params": {
|
62 |
-
"alpha": 0.
|
63 |
-
"num_train_epochs":
|
64 |
-
"temperature":
|
65 |
}
|
66 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
+
"eval_accuracy": 0.5893548387096774,
|
14 |
+
"eval_loss": 0.21903084218502045,
|
15 |
+
"eval_runtime": 14.3987,
|
16 |
+
"eval_samples_per_second": 215.297,
|
17 |
+
"eval_steps_per_second": 26.947,
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
+
"grad_norm": 0.5576276779174805,
|
23 |
+
"learning_rate": 1.4758909853249476e-05,
|
24 |
+
"loss": 0.3448,
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
+
"eval_accuracy": 0.8183870967741935,
|
30 |
+
"eval_loss": 0.10806020349264145,
|
31 |
+
"eval_runtime": 15.1708,
|
32 |
+
"eval_samples_per_second": 204.339,
|
33 |
+
"eval_steps_per_second": 25.575,
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
+
"eval_accuracy": 0.8777419354838709,
|
39 |
+
"eval_loss": 0.07312986254692078,
|
40 |
+
"eval_runtime": 15.1972,
|
41 |
+
"eval_samples_per_second": 203.985,
|
42 |
+
"eval_steps_per_second": 25.531,
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
+
"grad_norm": 0.5403371453285217,
|
48 |
+
"learning_rate": 9.517819706498952e-06,
|
49 |
+
"loss": 0.1247,
|
50 |
"step": 1000
|
51 |
}
|
52 |
],
|
53 |
"logging_steps": 500,
|
54 |
+
"max_steps": 1908,
|
55 |
"num_input_tokens_seen": 0,
|
56 |
+
"num_train_epochs": 6,
|
57 |
"save_steps": 500,
|
58 |
+
"total_flos": 308320501960968.0,
|
59 |
"train_batch_size": 48,
|
60 |
"trial_name": null,
|
61 |
"trial_params": {
|
62 |
+
"alpha": 0.04291496094703673,
|
63 |
+
"num_train_epochs": 6,
|
64 |
+
"temperature": 7
|
65 |
}
|
66 |
}
|
run-0/checkpoint-1000/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e6d301d3792da68a32e564cb44e96306a109e871eaaeebe23fb207a1f20fef33
|
3 |
size 5048
|
run-0/checkpoint-1500/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d1e69f0703aa726ed38410f75abe7aa65b71338a02f00cd30eb48e9e28ae82cc
|
3 |
size 268290900
|
run-0/checkpoint-1500/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 536643898
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6de24fd3dc67d7467a96bfbf4a84f84db15a52ecbb6a6b663db5eb95e5a246fe
|
3 |
size 536643898
|
run-0/checkpoint-1500/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2d0d96760a1cdcbc417a57dd4ff944b6ece136ebbdfecf57b1e511053d5ab0b
|
3 |
size 1064
|
run-0/checkpoint-1500/trainer_state.json
CHANGED
@@ -10,73 +10,73 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
-
"eval_accuracy": 0.
|
14 |
-
"eval_loss": 0.
|
15 |
-
"eval_runtime": 14.
|
16 |
-
"eval_samples_per_second":
|
17 |
-
"eval_steps_per_second": 26.
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
-
"grad_norm": 0.
|
23 |
-
"learning_rate": 1.
|
24 |
-
"loss": 0.
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
-
"eval_accuracy": 0.
|
30 |
-
"eval_loss": 0.
|
31 |
-
"eval_runtime":
|
32 |
-
"eval_samples_per_second":
|
33 |
-
"eval_steps_per_second":
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
-
"eval_accuracy": 0.
|
39 |
-
"eval_loss": 0.
|
40 |
-
"eval_runtime":
|
41 |
-
"eval_samples_per_second":
|
42 |
-
"eval_steps_per_second":
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
-
"grad_norm": 0.
|
48 |
-
"learning_rate":
|
49 |
-
"loss": 0.
|
50 |
"step": 1000
|
51 |
},
|
52 |
{
|
53 |
"epoch": 4.0,
|
54 |
-
"eval_accuracy": 0.
|
55 |
-
"eval_loss": 0.
|
56 |
-
"eval_runtime":
|
57 |
-
"eval_samples_per_second":
|
58 |
-
"eval_steps_per_second":
|
59 |
"step": 1272
|
60 |
},
|
61 |
{
|
62 |
"epoch": 4.716981132075472,
|
63 |
-
"grad_norm": 0.
|
64 |
-
"learning_rate":
|
65 |
-
"loss": 0.
|
66 |
"step": 1500
|
67 |
}
|
68 |
],
|
69 |
"logging_steps": 500,
|
70 |
-
"max_steps":
|
71 |
"num_input_tokens_seen": 0,
|
72 |
-
"num_train_epochs":
|
73 |
"save_steps": 500,
|
74 |
-
"total_flos":
|
75 |
"train_batch_size": 48,
|
76 |
"trial_name": null,
|
77 |
"trial_params": {
|
78 |
-
"alpha": 0.
|
79 |
-
"num_train_epochs":
|
80 |
-
"temperature":
|
81 |
}
|
82 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
+
"eval_accuracy": 0.5893548387096774,
|
14 |
+
"eval_loss": 0.21903084218502045,
|
15 |
+
"eval_runtime": 14.3987,
|
16 |
+
"eval_samples_per_second": 215.297,
|
17 |
+
"eval_steps_per_second": 26.947,
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
+
"grad_norm": 0.5576276779174805,
|
23 |
+
"learning_rate": 1.4758909853249476e-05,
|
24 |
+
"loss": 0.3448,
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
+
"eval_accuracy": 0.8183870967741935,
|
30 |
+
"eval_loss": 0.10806020349264145,
|
31 |
+
"eval_runtime": 15.1708,
|
32 |
+
"eval_samples_per_second": 204.339,
|
33 |
+
"eval_steps_per_second": 25.575,
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
+
"eval_accuracy": 0.8777419354838709,
|
39 |
+
"eval_loss": 0.07312986254692078,
|
40 |
+
"eval_runtime": 15.1972,
|
41 |
+
"eval_samples_per_second": 203.985,
|
42 |
+
"eval_steps_per_second": 25.531,
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
+
"grad_norm": 0.5403371453285217,
|
48 |
+
"learning_rate": 9.517819706498952e-06,
|
49 |
+
"loss": 0.1247,
|
50 |
"step": 1000
|
51 |
},
|
52 |
{
|
53 |
"epoch": 4.0,
|
54 |
+
"eval_accuracy": 0.8941935483870967,
|
55 |
+
"eval_loss": 0.05839391052722931,
|
56 |
+
"eval_runtime": 15.6617,
|
57 |
+
"eval_samples_per_second": 197.935,
|
58 |
+
"eval_steps_per_second": 24.774,
|
59 |
"step": 1272
|
60 |
},
|
61 |
{
|
62 |
"epoch": 4.716981132075472,
|
63 |
+
"grad_norm": 0.4541510045528412,
|
64 |
+
"learning_rate": 4.276729559748428e-06,
|
65 |
+
"loss": 0.0836,
|
66 |
"step": 1500
|
67 |
}
|
68 |
],
|
69 |
"logging_steps": 500,
|
70 |
+
"max_steps": 1908,
|
71 |
"num_input_tokens_seen": 0,
|
72 |
+
"num_train_epochs": 6,
|
73 |
"save_steps": 500,
|
74 |
+
"total_flos": 438748106750688.0,
|
75 |
"train_batch_size": 48,
|
76 |
"trial_name": null,
|
77 |
"trial_params": {
|
78 |
+
"alpha": 0.04291496094703673,
|
79 |
+
"num_train_epochs": 6,
|
80 |
+
"temperature": 7
|
81 |
}
|
82 |
}
|
run-0/checkpoint-1500/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e6d301d3792da68a32e564cb44e96306a109e871eaaeebe23fb207a1f20fef33
|
3 |
size 5048
|
run-0/checkpoint-500/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:28c546392df65d1dbaa8041f493b566777db4ef4ec5235ed2f1dfa2724cc15c9
|
3 |
size 268290900
|
run-0/checkpoint-500/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 536643898
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:90c2485ca187e7693a3dd9cb86988e00019c2099b440b0a466a0c63d4b195d87
|
3 |
size 536643898
|
run-0/checkpoint-500/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:40a61c60e2a1f354f14fb973803108ea16a9c8c66946147c407c26a5211c4f3d
|
3 |
size 1064
|
run-0/checkpoint-500/trainer_state.json
CHANGED
@@ -10,32 +10,32 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
-
"eval_accuracy": 0.
|
14 |
-
"eval_loss": 0.
|
15 |
-
"eval_runtime": 14.
|
16 |
-
"eval_samples_per_second":
|
17 |
-
"eval_steps_per_second": 26.
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
-
"grad_norm": 0.
|
23 |
-
"learning_rate": 1.
|
24 |
-
"loss": 0.
|
25 |
"step": 500
|
26 |
}
|
27 |
],
|
28 |
"logging_steps": 500,
|
29 |
-
"max_steps":
|
30 |
"num_input_tokens_seen": 0,
|
31 |
-
"num_train_epochs":
|
32 |
"save_steps": 500,
|
33 |
-
"total_flos":
|
34 |
"train_batch_size": 48,
|
35 |
"trial_name": null,
|
36 |
"trial_params": {
|
37 |
-
"alpha": 0.
|
38 |
-
"num_train_epochs":
|
39 |
-
"temperature":
|
40 |
}
|
41 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
+
"eval_accuracy": 0.5893548387096774,
|
14 |
+
"eval_loss": 0.21903084218502045,
|
15 |
+
"eval_runtime": 14.3987,
|
16 |
+
"eval_samples_per_second": 215.297,
|
17 |
+
"eval_steps_per_second": 26.947,
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
+
"grad_norm": 0.5576276779174805,
|
23 |
+
"learning_rate": 1.4758909853249476e-05,
|
24 |
+
"loss": 0.3448,
|
25 |
"step": 500
|
26 |
}
|
27 |
],
|
28 |
"logging_steps": 500,
|
29 |
+
"max_steps": 1908,
|
30 |
"num_input_tokens_seen": 0,
|
31 |
+
"num_train_epochs": 6,
|
32 |
"save_steps": 500,
|
33 |
+
"total_flos": 177451376459700.0,
|
34 |
"train_batch_size": 48,
|
35 |
"trial_name": null,
|
36 |
"trial_params": {
|
37 |
+
"alpha": 0.04291496094703673,
|
38 |
+
"num_train_epochs": 6,
|
39 |
+
"temperature": 7
|
40 |
}
|
41 |
}
|
run-0/checkpoint-500/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e6d301d3792da68a32e564cb44e96306a109e871eaaeebe23fb207a1f20fef33
|
3 |
size 5048
|
run-1/checkpoint-1000/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df0c1e7f0c76c2d33cfb8471753b518ac63cee9c321b2f9ff003512f3569f5dd
|
3 |
size 268290900
|
run-1/checkpoint-1000/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 536643898
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:29f4f538b0136a3857eeec3f711e1e7b22fb6659feb942992c6f42aae30b096a
|
3 |
size 536643898
|
run-1/checkpoint-1000/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:690dc36074a76886ef61aa14bba7f4a22546d075686d785f211a5a8037fc50f8
|
3 |
size 1064
|
run-1/checkpoint-1000/trainer_state.json
CHANGED
@@ -10,57 +10,57 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
-
"eval_accuracy": 0.
|
14 |
-
"eval_loss": 0.
|
15 |
-
"eval_runtime":
|
16 |
-
"eval_samples_per_second":
|
17 |
-
"eval_steps_per_second":
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
-
"grad_norm": 0.
|
23 |
-
"learning_rate": 1.
|
24 |
-
"loss": 0.
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
-
"eval_accuracy": 0.
|
30 |
-
"eval_loss": 0.
|
31 |
-
"eval_runtime":
|
32 |
-
"eval_samples_per_second":
|
33 |
-
"eval_steps_per_second":
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
-
"eval_accuracy": 0.
|
39 |
-
"eval_loss": 0.
|
40 |
-
"eval_runtime": 14.
|
41 |
-
"eval_samples_per_second": 209.
|
42 |
-
"eval_steps_per_second": 26.
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
-
"grad_norm": 0.
|
48 |
-
"learning_rate": 1.
|
49 |
-
"loss": 0.
|
50 |
"step": 1000
|
51 |
}
|
52 |
],
|
53 |
"logging_steps": 500,
|
54 |
-
"max_steps":
|
55 |
"num_input_tokens_seen": 0,
|
56 |
-
"num_train_epochs":
|
57 |
"save_steps": 500,
|
58 |
"total_flos": 260941334653608.0,
|
59 |
"train_batch_size": 48,
|
60 |
"trial_name": null,
|
61 |
"trial_params": {
|
62 |
-
"alpha": 0.
|
63 |
-
"num_train_epochs":
|
64 |
-
"temperature":
|
65 |
}
|
66 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
+
"eval_accuracy": 0.5832258064516129,
|
14 |
+
"eval_loss": 0.20071792602539062,
|
15 |
+
"eval_runtime": 13.9879,
|
16 |
+
"eval_samples_per_second": 221.619,
|
17 |
+
"eval_steps_per_second": 27.738,
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
+
"grad_norm": 0.5363588929176331,
|
23 |
+
"learning_rate": 1.550763701707098e-05,
|
24 |
+
"loss": 0.319,
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
+
"eval_accuracy": 0.817741935483871,
|
30 |
+
"eval_loss": 0.0997093915939331,
|
31 |
+
"eval_runtime": 14.8543,
|
32 |
+
"eval_samples_per_second": 208.694,
|
33 |
+
"eval_steps_per_second": 26.12,
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
+
"eval_accuracy": 0.8796774193548387,
|
39 |
+
"eval_loss": 0.06741480529308319,
|
40 |
+
"eval_runtime": 14.8004,
|
41 |
+
"eval_samples_per_second": 209.454,
|
42 |
+
"eval_steps_per_second": 26.216,
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
+
"grad_norm": 0.5110210180282593,
|
48 |
+
"learning_rate": 1.101527403414196e-05,
|
49 |
+
"loss": 0.1149,
|
50 |
"step": 1000
|
51 |
}
|
52 |
],
|
53 |
"logging_steps": 500,
|
54 |
+
"max_steps": 2226,
|
55 |
"num_input_tokens_seen": 0,
|
56 |
+
"num_train_epochs": 7,
|
57 |
"save_steps": 500,
|
58 |
"total_flos": 260941334653608.0,
|
59 |
"train_batch_size": 48,
|
60 |
"trial_name": null,
|
61 |
"trial_params": {
|
62 |
+
"alpha": 0.45847029900054825,
|
63 |
+
"num_train_epochs": 7,
|
64 |
+
"temperature": 12
|
65 |
}
|
66 |
}
|
run-1/checkpoint-1000/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c6c1680041c2ad9abd833a14af03572062f4c25ba5058b84c7ced516363495d8
|
3 |
size 5048
|
run-1/checkpoint-1500/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5fc350ced5969ce561b3410d20de09cf7562da401b84b0ed0134b1437acbfb7d
|
3 |
size 268290900
|
run-1/checkpoint-1500/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 536643898
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:324c9fb6b2e3ee46b5b1a32e37040174373891985432d194f761d51fc5fefa27
|
3 |
size 536643898
|
run-1/checkpoint-1500/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10247f8a1efaf9e3e35daf770b64f17a24e4101d4e010d50d94597b8e48a5f16
|
3 |
size 1064
|
run-1/checkpoint-1500/trainer_state.json
CHANGED
@@ -10,73 +10,73 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
-
"eval_accuracy": 0.
|
14 |
-
"eval_loss": 0.
|
15 |
-
"eval_runtime":
|
16 |
-
"eval_samples_per_second":
|
17 |
-
"eval_steps_per_second":
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
-
"grad_norm": 0.
|
23 |
-
"learning_rate": 1.
|
24 |
-
"loss": 0.
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
-
"eval_accuracy": 0.
|
30 |
-
"eval_loss": 0.
|
31 |
-
"eval_runtime":
|
32 |
-
"eval_samples_per_second":
|
33 |
-
"eval_steps_per_second":
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
-
"eval_accuracy": 0.
|
39 |
-
"eval_loss": 0.
|
40 |
-
"eval_runtime": 14.
|
41 |
-
"eval_samples_per_second": 209.
|
42 |
-
"eval_steps_per_second": 26.
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
-
"grad_norm": 0.
|
48 |
-
"learning_rate": 1.
|
49 |
-
"loss": 0.
|
50 |
"step": 1000
|
51 |
},
|
52 |
{
|
53 |
"epoch": 4.0,
|
54 |
-
"eval_accuracy": 0.
|
55 |
-
"eval_loss": 0.
|
56 |
-
"eval_runtime": 15.
|
57 |
-
"eval_samples_per_second":
|
58 |
-
"eval_steps_per_second":
|
59 |
"step": 1272
|
60 |
},
|
61 |
{
|
62 |
"epoch": 4.716981132075472,
|
63 |
-
"grad_norm": 0.
|
64 |
-
"learning_rate":
|
65 |
-
"loss": 0.
|
66 |
"step": 1500
|
67 |
}
|
68 |
],
|
69 |
"logging_steps": 500,
|
70 |
-
"max_steps":
|
71 |
"num_input_tokens_seen": 0,
|
72 |
-
"num_train_epochs":
|
73 |
"save_steps": 500,
|
74 |
"total_flos": 391368939443328.0,
|
75 |
"train_batch_size": 48,
|
76 |
"trial_name": null,
|
77 |
"trial_params": {
|
78 |
-
"alpha": 0.
|
79 |
-
"num_train_epochs":
|
80 |
-
"temperature":
|
81 |
}
|
82 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
+
"eval_accuracy": 0.5832258064516129,
|
14 |
+
"eval_loss": 0.20071792602539062,
|
15 |
+
"eval_runtime": 13.9879,
|
16 |
+
"eval_samples_per_second": 221.619,
|
17 |
+
"eval_steps_per_second": 27.738,
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
+
"grad_norm": 0.5363588929176331,
|
23 |
+
"learning_rate": 1.550763701707098e-05,
|
24 |
+
"loss": 0.319,
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
+
"eval_accuracy": 0.817741935483871,
|
30 |
+
"eval_loss": 0.0997093915939331,
|
31 |
+
"eval_runtime": 14.8543,
|
32 |
+
"eval_samples_per_second": 208.694,
|
33 |
+
"eval_steps_per_second": 26.12,
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
+
"eval_accuracy": 0.8796774193548387,
|
39 |
+
"eval_loss": 0.06741480529308319,
|
40 |
+
"eval_runtime": 14.8004,
|
41 |
+
"eval_samples_per_second": 209.454,
|
42 |
+
"eval_steps_per_second": 26.216,
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
+
"grad_norm": 0.5110210180282593,
|
48 |
+
"learning_rate": 1.101527403414196e-05,
|
49 |
+
"loss": 0.1149,
|
50 |
"step": 1000
|
51 |
},
|
52 |
{
|
53 |
"epoch": 4.0,
|
54 |
+
"eval_accuracy": 0.8958064516129032,
|
55 |
+
"eval_loss": 0.05332249775528908,
|
56 |
+
"eval_runtime": 15.4008,
|
57 |
+
"eval_samples_per_second": 201.288,
|
58 |
+
"eval_steps_per_second": 25.193,
|
59 |
"step": 1272
|
60 |
},
|
61 |
{
|
62 |
"epoch": 4.716981132075472,
|
63 |
+
"grad_norm": 0.4394994378089905,
|
64 |
+
"learning_rate": 6.522911051212939e-06,
|
65 |
+
"loss": 0.076,
|
66 |
"step": 1500
|
67 |
}
|
68 |
],
|
69 |
"logging_steps": 500,
|
70 |
+
"max_steps": 2226,
|
71 |
"num_input_tokens_seen": 0,
|
72 |
+
"num_train_epochs": 7,
|
73 |
"save_steps": 500,
|
74 |
"total_flos": 391368939443328.0,
|
75 |
"train_batch_size": 48,
|
76 |
"trial_name": null,
|
77 |
"trial_params": {
|
78 |
+
"alpha": 0.45847029900054825,
|
79 |
+
"num_train_epochs": 7,
|
80 |
+
"temperature": 12
|
81 |
}
|
82 |
}
|
run-1/checkpoint-1500/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c6c1680041c2ad9abd833a14af03572062f4c25ba5058b84c7ced516363495d8
|
3 |
size 5048
|
run-1/checkpoint-2000/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f31f365c8cfca47c670086bb1001bed52e8c1d972d75627a773daece075694b
|
3 |
size 268290900
|
run-1/checkpoint-2000/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 536643898
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ecb1cbaa7d177727fa64b2f9a62279c842a162c46259dad879df619d19139e69
|
3 |
size 536643898
|
run-1/checkpoint-2000/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a711dbdbcc3fb81414c43be534b20600369a5338cb39c6b70af432fa041190bb
|
3 |
size 1064
|
run-1/checkpoint-2000/trainer_state.json
CHANGED
@@ -10,98 +10,98 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
-
"eval_accuracy": 0.
|
14 |
-
"eval_loss": 0.
|
15 |
-
"eval_runtime":
|
16 |
-
"eval_samples_per_second":
|
17 |
-
"eval_steps_per_second":
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
-
"grad_norm": 0.
|
23 |
-
"learning_rate": 1.
|
24 |
-
"loss": 0.
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
-
"eval_accuracy": 0.
|
30 |
-
"eval_loss": 0.
|
31 |
-
"eval_runtime":
|
32 |
-
"eval_samples_per_second":
|
33 |
-
"eval_steps_per_second":
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
-
"eval_accuracy": 0.
|
39 |
-
"eval_loss": 0.
|
40 |
-
"eval_runtime": 14.
|
41 |
-
"eval_samples_per_second": 209.
|
42 |
-
"eval_steps_per_second": 26.
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
-
"grad_norm": 0.
|
48 |
-
"learning_rate": 1.
|
49 |
-
"loss": 0.
|
50 |
"step": 1000
|
51 |
},
|
52 |
{
|
53 |
"epoch": 4.0,
|
54 |
-
"eval_accuracy": 0.
|
55 |
-
"eval_loss": 0.
|
56 |
-
"eval_runtime": 15.
|
57 |
-
"eval_samples_per_second":
|
58 |
-
"eval_steps_per_second":
|
59 |
"step": 1272
|
60 |
},
|
61 |
{
|
62 |
"epoch": 4.716981132075472,
|
63 |
-
"grad_norm": 0.
|
64 |
-
"learning_rate":
|
65 |
-
"loss": 0.
|
66 |
"step": 1500
|
67 |
},
|
68 |
{
|
69 |
"epoch": 5.0,
|
70 |
-
"eval_accuracy": 0.
|
71 |
-
"eval_loss": 0.
|
72 |
-
"eval_runtime":
|
73 |
-
"eval_samples_per_second":
|
74 |
-
"eval_steps_per_second":
|
75 |
"step": 1590
|
76 |
},
|
77 |
{
|
78 |
"epoch": 6.0,
|
79 |
-
"eval_accuracy": 0.
|
80 |
-
"eval_loss": 0.
|
81 |
-
"eval_runtime": 14.
|
82 |
-
"eval_samples_per_second":
|
83 |
-
"eval_steps_per_second": 26.
|
84 |
"step": 1908
|
85 |
},
|
86 |
{
|
87 |
"epoch": 6.289308176100629,
|
88 |
-
"grad_norm": 0.
|
89 |
-
"learning_rate":
|
90 |
-
"loss": 0.
|
91 |
"step": 2000
|
92 |
}
|
93 |
],
|
94 |
"logging_steps": 500,
|
95 |
-
"max_steps":
|
96 |
"num_input_tokens_seen": 0,
|
97 |
-
"num_train_epochs":
|
98 |
"save_steps": 500,
|
99 |
"total_flos": 520991326672152.0,
|
100 |
"train_batch_size": 48,
|
101 |
"trial_name": null,
|
102 |
"trial_params": {
|
103 |
-
"alpha": 0.
|
104 |
-
"num_train_epochs":
|
105 |
-
"temperature":
|
106 |
}
|
107 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
+
"eval_accuracy": 0.5832258064516129,
|
14 |
+
"eval_loss": 0.20071792602539062,
|
15 |
+
"eval_runtime": 13.9879,
|
16 |
+
"eval_samples_per_second": 221.619,
|
17 |
+
"eval_steps_per_second": 27.738,
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
+
"grad_norm": 0.5363588929176331,
|
23 |
+
"learning_rate": 1.550763701707098e-05,
|
24 |
+
"loss": 0.319,
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
+
"eval_accuracy": 0.817741935483871,
|
30 |
+
"eval_loss": 0.0997093915939331,
|
31 |
+
"eval_runtime": 14.8543,
|
32 |
+
"eval_samples_per_second": 208.694,
|
33 |
+
"eval_steps_per_second": 26.12,
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
+
"eval_accuracy": 0.8796774193548387,
|
39 |
+
"eval_loss": 0.06741480529308319,
|
40 |
+
"eval_runtime": 14.8004,
|
41 |
+
"eval_samples_per_second": 209.454,
|
42 |
+
"eval_steps_per_second": 26.216,
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
+
"grad_norm": 0.5110210180282593,
|
48 |
+
"learning_rate": 1.101527403414196e-05,
|
49 |
+
"loss": 0.1149,
|
50 |
"step": 1000
|
51 |
},
|
52 |
{
|
53 |
"epoch": 4.0,
|
54 |
+
"eval_accuracy": 0.8958064516129032,
|
55 |
+
"eval_loss": 0.05332249775528908,
|
56 |
+
"eval_runtime": 15.4008,
|
57 |
+
"eval_samples_per_second": 201.288,
|
58 |
+
"eval_steps_per_second": 25.193,
|
59 |
"step": 1272
|
60 |
},
|
61 |
{
|
62 |
"epoch": 4.716981132075472,
|
63 |
+
"grad_norm": 0.4394994378089905,
|
64 |
+
"learning_rate": 6.522911051212939e-06,
|
65 |
+
"loss": 0.076,
|
66 |
"step": 1500
|
67 |
},
|
68 |
{
|
69 |
"epoch": 5.0,
|
70 |
+
"eval_accuracy": 0.9029032258064517,
|
71 |
+
"eval_loss": 0.045641668140888214,
|
72 |
+
"eval_runtime": 15.8694,
|
73 |
+
"eval_samples_per_second": 195.344,
|
74 |
+
"eval_steps_per_second": 24.45,
|
75 |
"step": 1590
|
76 |
},
|
77 |
{
|
78 |
"epoch": 6.0,
|
79 |
+
"eval_accuracy": 0.9064516129032258,
|
80 |
+
"eval_loss": 0.04183841869235039,
|
81 |
+
"eval_runtime": 14.4904,
|
82 |
+
"eval_samples_per_second": 213.935,
|
83 |
+
"eval_steps_per_second": 26.776,
|
84 |
"step": 1908
|
85 |
},
|
86 |
{
|
87 |
"epoch": 6.289308176100629,
|
88 |
+
"grad_norm": 0.24045950174331665,
|
89 |
+
"learning_rate": 2.0305480682839176e-06,
|
90 |
+
"loss": 0.0628,
|
91 |
"step": 2000
|
92 |
}
|
93 |
],
|
94 |
"logging_steps": 500,
|
95 |
+
"max_steps": 2226,
|
96 |
"num_input_tokens_seen": 0,
|
97 |
+
"num_train_epochs": 7,
|
98 |
"save_steps": 500,
|
99 |
"total_flos": 520991326672152.0,
|
100 |
"train_batch_size": 48,
|
101 |
"trial_name": null,
|
102 |
"trial_params": {
|
103 |
+
"alpha": 0.45847029900054825,
|
104 |
+
"num_train_epochs": 7,
|
105 |
+
"temperature": 12
|
106 |
}
|
107 |
}
|
run-1/checkpoint-2000/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c6c1680041c2ad9abd833a14af03572062f4c25ba5058b84c7ced516363495d8
|
3 |
size 5048
|
run-1/checkpoint-500/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c9ff7ee88c51305753f34881d5d807803d2ef768fdf00a44f2240dc31c12d143
|
3 |
size 268290900
|
run-1/checkpoint-500/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 536643898
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed5aa0f69b2d0528b6cde1f2e0cb8bca5fa3309cd55e96158515b886914782b4
|
3 |
size 536643898
|
run-1/checkpoint-500/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7c898ef245a654a7c97315fc9d4b5879d3c8228b82434bdf35669bf866597c60
|
3 |
size 1064
|
run-1/checkpoint-500/trainer_state.json
CHANGED
@@ -10,32 +10,32 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
-
"eval_accuracy": 0.
|
14 |
-
"eval_loss": 0.
|
15 |
-
"eval_runtime":
|
16 |
-
"eval_samples_per_second":
|
17 |
-
"eval_steps_per_second":
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
-
"grad_norm": 0.
|
23 |
-
"learning_rate": 1.
|
24 |
-
"loss": 0.
|
25 |
"step": 500
|
26 |
}
|
27 |
],
|
28 |
"logging_steps": 500,
|
29 |
-
"max_steps":
|
30 |
"num_input_tokens_seen": 0,
|
31 |
-
"num_train_epochs":
|
32 |
"save_steps": 500,
|
33 |
"total_flos": 130072209152340.0,
|
34 |
"train_batch_size": 48,
|
35 |
"trial_name": null,
|
36 |
"trial_params": {
|
37 |
-
"alpha": 0.
|
38 |
-
"num_train_epochs":
|
39 |
-
"temperature":
|
40 |
}
|
41 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
+
"eval_accuracy": 0.5832258064516129,
|
14 |
+
"eval_loss": 0.20071792602539062,
|
15 |
+
"eval_runtime": 13.9879,
|
16 |
+
"eval_samples_per_second": 221.619,
|
17 |
+
"eval_steps_per_second": 27.738,
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
+
"grad_norm": 0.5363588929176331,
|
23 |
+
"learning_rate": 1.550763701707098e-05,
|
24 |
+
"loss": 0.319,
|
25 |
"step": 500
|
26 |
}
|
27 |
],
|
28 |
"logging_steps": 500,
|
29 |
+
"max_steps": 2226,
|
30 |
"num_input_tokens_seen": 0,
|
31 |
+
"num_train_epochs": 7,
|
32 |
"save_steps": 500,
|
33 |
"total_flos": 130072209152340.0,
|
34 |
"train_batch_size": 48,
|
35 |
"trial_name": null,
|
36 |
"trial_params": {
|
37 |
+
"alpha": 0.45847029900054825,
|
38 |
+
"num_train_epochs": 7,
|
39 |
+
"temperature": 12
|
40 |
}
|
41 |
}
|
run-1/checkpoint-500/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c6c1680041c2ad9abd833a14af03572062f4c25ba5058b84c7ced516363495d8
|
3 |
size 5048
|
run-2/checkpoint-1000/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0fe188f101f50feb9198e95a3dc4c7e41b408f1d4566257a86bbf8d149867e46
|
3 |
size 268290900
|
run-2/checkpoint-1000/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 536643898
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be1f42c06e9cd80680a9cd15ef8cf85ddfb7e00ada7b80ee55d20c1adabf6d04
|
3 |
size 536643898
|
run-2/checkpoint-1000/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:690dc36074a76886ef61aa14bba7f4a22546d075686d785f211a5a8037fc50f8
|
3 |
size 1064
|
run-2/checkpoint-1000/trainer_state.json
CHANGED
@@ -10,57 +10,57 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
-
"eval_accuracy": 0.
|
14 |
-
"eval_loss": 0.
|
15 |
-
"eval_runtime":
|
16 |
-
"eval_samples_per_second":
|
17 |
-
"eval_steps_per_second":
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
-
"grad_norm": 0.
|
23 |
-
"learning_rate": 1.
|
24 |
-
"loss": 0.
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
-
"eval_accuracy": 0.
|
30 |
-
"eval_loss": 0.
|
31 |
-
"eval_runtime": 15.
|
32 |
-
"eval_samples_per_second":
|
33 |
-
"eval_steps_per_second":
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
-
"eval_accuracy": 0.
|
39 |
-
"eval_loss": 0.
|
40 |
-
"eval_runtime":
|
41 |
-
"eval_samples_per_second":
|
42 |
-
"eval_steps_per_second":
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
-
"grad_norm": 0.
|
48 |
-
"learning_rate": 1.
|
49 |
-
"loss": 0.
|
50 |
"step": 1000
|
51 |
}
|
52 |
],
|
53 |
"logging_steps": 500,
|
54 |
-
"max_steps":
|
55 |
"num_input_tokens_seen": 0,
|
56 |
-
"num_train_epochs":
|
57 |
"save_steps": 500,
|
58 |
"total_flos": 260941334653608.0,
|
59 |
"train_batch_size": 48,
|
60 |
"trial_name": null,
|
61 |
"trial_params": {
|
62 |
-
"alpha": 0.
|
63 |
-
"num_train_epochs":
|
64 |
-
"temperature":
|
65 |
}
|
66 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
+
"eval_accuracy": 0.5706451612903226,
|
14 |
+
"eval_loss": 0.193728968501091,
|
15 |
+
"eval_runtime": 13.9503,
|
16 |
+
"eval_samples_per_second": 222.218,
|
17 |
+
"eval_steps_per_second": 27.813,
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
+
"grad_norm": 0.5241254568099976,
|
23 |
+
"learning_rate": 1.550763701707098e-05,
|
24 |
+
"loss": 0.3078,
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
+
"eval_accuracy": 0.8125806451612904,
|
30 |
+
"eval_loss": 0.09734208881855011,
|
31 |
+
"eval_runtime": 15.441,
|
32 |
+
"eval_samples_per_second": 200.764,
|
33 |
+
"eval_steps_per_second": 25.128,
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
+
"eval_accuracy": 0.8764516129032258,
|
39 |
+
"eval_loss": 0.06649673730134964,
|
40 |
+
"eval_runtime": 15.2951,
|
41 |
+
"eval_samples_per_second": 202.68,
|
42 |
+
"eval_steps_per_second": 25.368,
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
+
"grad_norm": 0.48943573236465454,
|
48 |
+
"learning_rate": 1.101527403414196e-05,
|
49 |
+
"loss": 0.1119,
|
50 |
"step": 1000
|
51 |
}
|
52 |
],
|
53 |
"logging_steps": 500,
|
54 |
+
"max_steps": 2226,
|
55 |
"num_input_tokens_seen": 0,
|
56 |
+
"num_train_epochs": 7,
|
57 |
"save_steps": 500,
|
58 |
"total_flos": 260941334653608.0,
|
59 |
"train_batch_size": 48,
|
60 |
"trial_name": null,
|
61 |
"trial_params": {
|
62 |
+
"alpha": 0.6214130149862211,
|
63 |
+
"num_train_epochs": 7,
|
64 |
+
"temperature": 20
|
65 |
}
|
66 |
}
|
run-2/checkpoint-1000/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e4b95c2d951c8ddeee32e64101bcd9cc3cdfbbcc360a14eb2bcff65f122add0c
|
3 |
size 5048
|
run-2/checkpoint-1500/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:088b5f3d69528548b35406acaa3db91dee8992d804676296feaad4872201b67b
|
3 |
size 268290900
|
run-2/checkpoint-1500/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 536643898
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c40ca0e30e7620794125bfec7c53291f960cf7acc42d62bb271882880164a069
|
3 |
size 536643898
|
run-2/checkpoint-1500/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10247f8a1efaf9e3e35daf770b64f17a24e4101d4e010d50d94597b8e48a5f16
|
3 |
size 1064
|
run-2/checkpoint-1500/trainer_state.json
CHANGED
@@ -10,73 +10,73 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
-
"eval_accuracy": 0.
|
14 |
-
"eval_loss": 0.
|
15 |
-
"eval_runtime":
|
16 |
-
"eval_samples_per_second":
|
17 |
-
"eval_steps_per_second":
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
-
"grad_norm": 0.
|
23 |
-
"learning_rate": 1.
|
24 |
-
"loss": 0.
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
-
"eval_accuracy": 0.
|
30 |
-
"eval_loss": 0.
|
31 |
-
"eval_runtime": 15.
|
32 |
-
"eval_samples_per_second":
|
33 |
-
"eval_steps_per_second":
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
-
"eval_accuracy": 0.
|
39 |
-
"eval_loss": 0.
|
40 |
-
"eval_runtime":
|
41 |
-
"eval_samples_per_second":
|
42 |
-
"eval_steps_per_second":
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
-
"grad_norm": 0.
|
48 |
-
"learning_rate": 1.
|
49 |
-
"loss": 0.
|
50 |
"step": 1000
|
51 |
},
|
52 |
{
|
53 |
"epoch": 4.0,
|
54 |
-
"eval_accuracy": 0.
|
55 |
-
"eval_loss": 0.
|
56 |
-
"eval_runtime": 15.
|
57 |
-
"eval_samples_per_second":
|
58 |
-
"eval_steps_per_second": 24.
|
59 |
"step": 1272
|
60 |
},
|
61 |
{
|
62 |
"epoch": 4.716981132075472,
|
63 |
-
"grad_norm": 0.
|
64 |
-
"learning_rate":
|
65 |
-
"loss": 0.
|
66 |
"step": 1500
|
67 |
}
|
68 |
],
|
69 |
"logging_steps": 500,
|
70 |
-
"max_steps":
|
71 |
"num_input_tokens_seen": 0,
|
72 |
-
"num_train_epochs":
|
73 |
"save_steps": 500,
|
74 |
"total_flos": 391368939443328.0,
|
75 |
"train_batch_size": 48,
|
76 |
"trial_name": null,
|
77 |
"trial_params": {
|
78 |
-
"alpha": 0.
|
79 |
-
"num_train_epochs":
|
80 |
-
"temperature":
|
81 |
}
|
82 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
+
"eval_accuracy": 0.5706451612903226,
|
14 |
+
"eval_loss": 0.193728968501091,
|
15 |
+
"eval_runtime": 13.9503,
|
16 |
+
"eval_samples_per_second": 222.218,
|
17 |
+
"eval_steps_per_second": 27.813,
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
+
"grad_norm": 0.5241254568099976,
|
23 |
+
"learning_rate": 1.550763701707098e-05,
|
24 |
+
"loss": 0.3078,
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
+
"eval_accuracy": 0.8125806451612904,
|
30 |
+
"eval_loss": 0.09734208881855011,
|
31 |
+
"eval_runtime": 15.441,
|
32 |
+
"eval_samples_per_second": 200.764,
|
33 |
+
"eval_steps_per_second": 25.128,
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
+
"eval_accuracy": 0.8764516129032258,
|
39 |
+
"eval_loss": 0.06649673730134964,
|
40 |
+
"eval_runtime": 15.2951,
|
41 |
+
"eval_samples_per_second": 202.68,
|
42 |
+
"eval_steps_per_second": 25.368,
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
+
"grad_norm": 0.48943573236465454,
|
48 |
+
"learning_rate": 1.101527403414196e-05,
|
49 |
+
"loss": 0.1119,
|
50 |
"step": 1000
|
51 |
},
|
52 |
{
|
53 |
"epoch": 4.0,
|
54 |
+
"eval_accuracy": 0.8961290322580645,
|
55 |
+
"eval_loss": 0.052904579788446426,
|
56 |
+
"eval_runtime": 15.6918,
|
57 |
+
"eval_samples_per_second": 197.556,
|
58 |
+
"eval_steps_per_second": 24.726,
|
59 |
"step": 1272
|
60 |
},
|
61 |
{
|
62 |
"epoch": 4.716981132075472,
|
63 |
+
"grad_norm": 0.4308073818683624,
|
64 |
+
"learning_rate": 6.522911051212939e-06,
|
65 |
+
"loss": 0.0747,
|
66 |
"step": 1500
|
67 |
}
|
68 |
],
|
69 |
"logging_steps": 500,
|
70 |
+
"max_steps": 2226,
|
71 |
"num_input_tokens_seen": 0,
|
72 |
+
"num_train_epochs": 7,
|
73 |
"save_steps": 500,
|
74 |
"total_flos": 391368939443328.0,
|
75 |
"train_batch_size": 48,
|
76 |
"trial_name": null,
|
77 |
"trial_params": {
|
78 |
+
"alpha": 0.6214130149862211,
|
79 |
+
"num_train_epochs": 7,
|
80 |
+
"temperature": 20
|
81 |
}
|
82 |
}
|
run-2/checkpoint-1500/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5048
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e4b95c2d951c8ddeee32e64101bcd9cc3cdfbbcc360a14eb2bcff65f122add0c
|
3 |
size 5048
|
run-2/checkpoint-2000/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3aaecbafc3ed40a9084ebb560e44458e3b65f97f097db4423e14c2a5e9e6d9ad
|
3 |
size 268290900
|
run-2/checkpoint-2000/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 536643898
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d3fabec28558416d5dbb27164a4cea59f5112a144c5f425ef7473f151e5e7225
|
3 |
size 536643898
|
run-2/checkpoint-2000/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a711dbdbcc3fb81414c43be534b20600369a5338cb39c6b70af432fa041190bb
|
3 |
size 1064
|
run-2/checkpoint-2000/trainer_state.json
CHANGED
@@ -10,98 +10,98 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
-
"eval_accuracy": 0.
|
14 |
-
"eval_loss": 0.
|
15 |
-
"eval_runtime":
|
16 |
-
"eval_samples_per_second":
|
17 |
-
"eval_steps_per_second":
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
-
"grad_norm": 0.
|
23 |
-
"learning_rate": 1.
|
24 |
-
"loss": 0.
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
-
"eval_accuracy": 0.
|
30 |
-
"eval_loss": 0.
|
31 |
-
"eval_runtime": 15.
|
32 |
-
"eval_samples_per_second":
|
33 |
-
"eval_steps_per_second":
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
-
"eval_accuracy": 0.
|
39 |
-
"eval_loss": 0.
|
40 |
-
"eval_runtime":
|
41 |
-
"eval_samples_per_second":
|
42 |
-
"eval_steps_per_second":
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
-
"grad_norm": 0.
|
48 |
-
"learning_rate": 1.
|
49 |
-
"loss": 0.
|
50 |
"step": 1000
|
51 |
},
|
52 |
{
|
53 |
"epoch": 4.0,
|
54 |
-
"eval_accuracy": 0.
|
55 |
-
"eval_loss": 0.
|
56 |
-
"eval_runtime": 15.
|
57 |
-
"eval_samples_per_second":
|
58 |
-
"eval_steps_per_second": 24.
|
59 |
"step": 1272
|
60 |
},
|
61 |
{
|
62 |
"epoch": 4.716981132075472,
|
63 |
-
"grad_norm": 0.
|
64 |
-
"learning_rate":
|
65 |
-
"loss": 0.
|
66 |
"step": 1500
|
67 |
},
|
68 |
{
|
69 |
"epoch": 5.0,
|
70 |
-
"eval_accuracy": 0.
|
71 |
-
"eval_loss": 0.
|
72 |
-
"eval_runtime":
|
73 |
-
"eval_samples_per_second":
|
74 |
-
"eval_steps_per_second":
|
75 |
"step": 1590
|
76 |
},
|
77 |
{
|
78 |
"epoch": 6.0,
|
79 |
-
"eval_accuracy": 0.
|
80 |
-
"eval_loss": 0.
|
81 |
-
"eval_runtime":
|
82 |
-
"eval_samples_per_second":
|
83 |
-
"eval_steps_per_second":
|
84 |
"step": 1908
|
85 |
},
|
86 |
{
|
87 |
"epoch": 6.289308176100629,
|
88 |
-
"grad_norm": 0.
|
89 |
-
"learning_rate":
|
90 |
-
"loss": 0.
|
91 |
"step": 2000
|
92 |
}
|
93 |
],
|
94 |
"logging_steps": 500,
|
95 |
-
"max_steps":
|
96 |
"num_input_tokens_seen": 0,
|
97 |
-
"num_train_epochs":
|
98 |
"save_steps": 500,
|
99 |
"total_flos": 520991326672152.0,
|
100 |
"train_batch_size": 48,
|
101 |
"trial_name": null,
|
102 |
"trial_params": {
|
103 |
-
"alpha": 0.
|
104 |
-
"num_train_epochs":
|
105 |
-
"temperature":
|
106 |
}
|
107 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
+
"eval_accuracy": 0.5706451612903226,
|
14 |
+
"eval_loss": 0.193728968501091,
|
15 |
+
"eval_runtime": 13.9503,
|
16 |
+
"eval_samples_per_second": 222.218,
|
17 |
+
"eval_steps_per_second": 27.813,
|
18 |
"step": 318
|
19 |
},
|
20 |
{
|
21 |
"epoch": 1.5723270440251573,
|
22 |
+
"grad_norm": 0.5241254568099976,
|
23 |
+
"learning_rate": 1.550763701707098e-05,
|
24 |
+
"loss": 0.3078,
|
25 |
"step": 500
|
26 |
},
|
27 |
{
|
28 |
"epoch": 2.0,
|
29 |
+
"eval_accuracy": 0.8125806451612904,
|
30 |
+
"eval_loss": 0.09734208881855011,
|
31 |
+
"eval_runtime": 15.441,
|
32 |
+
"eval_samples_per_second": 200.764,
|
33 |
+
"eval_steps_per_second": 25.128,
|
34 |
"step": 636
|
35 |
},
|
36 |
{
|
37 |
"epoch": 3.0,
|
38 |
+
"eval_accuracy": 0.8764516129032258,
|
39 |
+
"eval_loss": 0.06649673730134964,
|
40 |
+
"eval_runtime": 15.2951,
|
41 |
+
"eval_samples_per_second": 202.68,
|
42 |
+
"eval_steps_per_second": 25.368,
|
43 |
"step": 954
|
44 |
},
|
45 |
{
|
46 |
"epoch": 3.1446540880503147,
|
47 |
+
"grad_norm": 0.48943573236465454,
|
48 |
+
"learning_rate": 1.101527403414196e-05,
|
49 |
+
"loss": 0.1119,
|
50 |
"step": 1000
|
51 |
},
|
52 |
{
|
53 |
"epoch": 4.0,
|
54 |
+
"eval_accuracy": 0.8961290322580645,
|
55 |
+
"eval_loss": 0.052904579788446426,
|
56 |
+
"eval_runtime": 15.6918,
|
57 |
+
"eval_samples_per_second": 197.556,
|
58 |
+
"eval_steps_per_second": 24.726,
|
59 |
"step": 1272
|
60 |
},
|
61 |
{
|
62 |
"epoch": 4.716981132075472,
|
63 |
+
"grad_norm": 0.4308073818683624,
|
64 |
+
"learning_rate": 6.522911051212939e-06,
|
65 |
+
"loss": 0.0747,
|
66 |
"step": 1500
|
67 |
},
|
68 |
{
|
69 |
"epoch": 5.0,
|
70 |
+
"eval_accuracy": 0.9025806451612903,
|
71 |
+
"eval_loss": 0.04545857757329941,
|
72 |
+
"eval_runtime": 16.5544,
|
73 |
+
"eval_samples_per_second": 187.262,
|
74 |
+
"eval_steps_per_second": 23.438,
|
75 |
"step": 1590
|
76 |
},
|
77 |
{
|
78 |
"epoch": 6.0,
|
79 |
+
"eval_accuracy": 0.9048387096774193,
|
80 |
+
"eval_loss": 0.04176154360175133,
|
81 |
+
"eval_runtime": 15.4351,
|
82 |
+
"eval_samples_per_second": 200.841,
|
83 |
+
"eval_steps_per_second": 25.138,
|
84 |
"step": 1908
|
85 |
},
|
86 |
{
|
87 |
"epoch": 6.289308176100629,
|
88 |
+
"grad_norm": 0.23608292639255524,
|
89 |
+
"learning_rate": 2.0305480682839176e-06,
|
90 |
+
"loss": 0.0619,
|
91 |
"step": 2000
|
92 |
}
|
93 |
],
|
94 |
"logging_steps": 500,
|
95 |
+
"max_steps": 2226,
|
96 |
"num_input_tokens_seen": 0,
|
97 |
+
"num_train_epochs": 7,
|
98 |
"save_steps": 500,
|
99 |
"total_flos": 520991326672152.0,
|
100 |
"train_batch_size": 48,
|
101 |
"trial_name": null,
|
102 |
"trial_params": {
|
103 |
+
"alpha": 0.6214130149862211,
|
104 |
+
"num_train_epochs": 7,
|
105 |
+
"temperature": 20
|
106 |
}
|
107 |
}
|