Upload folder using huggingface_hub
Browse files- README.md +4 -4
- adapter_config.json +2 -2
- adapter_model.safetensors +1 -1
- all_results.json +6 -6
- train_results.json +6 -6
- trainer_log.jsonl +4 -37
- trainer_state.json +21 -252
- training_args.bin +2 -2
- training_loss.png +0 -0
README.md
CHANGED
@@ -7,14 +7,14 @@ tags:
|
|
7 |
- generated_from_trainer
|
8 |
base_model: microsoft/phi-2
|
9 |
model-index:
|
10 |
-
- name:
|
11 |
results: []
|
12 |
---
|
13 |
|
14 |
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
15 |
should probably proofread and complete it, then remove this comment. -->
|
16 |
|
17 |
-
#
|
18 |
|
19 |
This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) on the trivia dataset.
|
20 |
|
@@ -35,7 +35,7 @@ More information needed
|
|
35 |
### Training hyperparameters
|
36 |
|
37 |
The following hyperparameters were used during training:
|
38 |
-
- learning_rate: 5e-
|
39 |
- train_batch_size: 2
|
40 |
- eval_batch_size: 8
|
41 |
- seed: 42
|
@@ -43,7 +43,7 @@ The following hyperparameters were used during training:
|
|
43 |
- total_train_batch_size: 16
|
44 |
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
45 |
- lr_scheduler_type: cosine
|
46 |
-
- num_epochs:
|
47 |
|
48 |
### Training results
|
49 |
|
|
|
7 |
- generated_from_trainer
|
8 |
base_model: microsoft/phi-2
|
9 |
model-index:
|
10 |
+
- name: glaiveNew
|
11 |
results: []
|
12 |
---
|
13 |
|
14 |
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
15 |
should probably proofread and complete it, then remove this comment. -->
|
16 |
|
17 |
+
# glaiveNew
|
18 |
|
19 |
This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) on the trivia dataset.
|
20 |
|
|
|
35 |
### Training hyperparameters
|
36 |
|
37 |
The following hyperparameters were used during training:
|
38 |
+
- learning_rate: 5e-05
|
39 |
- train_batch_size: 2
|
40 |
- eval_batch_size: 8
|
41 |
- seed: 42
|
|
|
43 |
- total_train_batch_size: 16
|
44 |
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
45 |
- lr_scheduler_type: cosine
|
46 |
+
- num_epochs: 3.0
|
47 |
|
48 |
### Training results
|
49 |
|
adapter_config.json
CHANGED
@@ -20,8 +20,8 @@
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
-
"
|
24 |
-
"
|
25 |
],
|
26 |
"task_type": "CAUSAL_LM",
|
27 |
"use_dora": false,
|
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
+
"q_proj",
|
24 |
+
"v_proj"
|
25 |
],
|
26 |
"task_type": "CAUSAL_LM",
|
27 |
"use_dora": false,
|
adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 10502640
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9abcb87ca0fc0f59556142c682feb9092e98296cd75e9449e28864e34339661a
|
3 |
size 10502640
|
all_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
-
"epoch":
|
3 |
-
"total_flos":
|
4 |
-
"train_loss":
|
5 |
-
"train_runtime":
|
6 |
-
"train_samples_per_second": 5.
|
7 |
-
"train_steps_per_second": 0.
|
8 |
}
|
|
|
1 |
{
|
2 |
+
"epoch": 2.88,
|
3 |
+
"total_flos": 133622157312000.0,
|
4 |
+
"train_loss": 6.60821893480089,
|
5 |
+
"train_runtime": 59.777,
|
6 |
+
"train_samples_per_second": 5.019,
|
7 |
+
"train_steps_per_second": 0.301
|
8 |
}
|
train_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
-
"epoch":
|
3 |
-
"total_flos":
|
4 |
-
"train_loss":
|
5 |
-
"train_runtime":
|
6 |
-
"train_samples_per_second": 5.
|
7 |
-
"train_steps_per_second": 0.
|
8 |
}
|
|
|
1 |
{
|
2 |
+
"epoch": 2.88,
|
3 |
+
"total_flos": 133622157312000.0,
|
4 |
+
"train_loss": 6.60821893480089,
|
5 |
+
"train_runtime": 59.777,
|
6 |
+
"train_samples_per_second": 5.019,
|
7 |
+
"train_steps_per_second": 0.301
|
8 |
}
|
trainer_log.jsonl
CHANGED
@@ -1,37 +1,4 @@
|
|
1 |
-
{"current_steps": 5, "total_steps":
|
2 |
-
{"current_steps": 10, "total_steps":
|
3 |
-
{"current_steps": 15, "total_steps":
|
4 |
-
{"current_steps":
|
5 |
-
{"current_steps": 25, "total_steps": 180, "loss": 7.1004, "learning_rate": 4.765769467591626e-10, "epoch": 4.0, "percentage": 13.89, "elapsed_time": "0:01:21", "remaining_time": "0:08:27"}
|
6 |
-
{"current_steps": 30, "total_steps": 180, "loss": 6.9577, "learning_rate": 4.665063509461097e-10, "epoch": 4.8, "percentage": 16.67, "elapsed_time": "0:01:38", "remaining_time": "0:08:11"}
|
7 |
-
{"current_steps": 35, "total_steps": 180, "loss": 7.1725, "learning_rate": 4.54788011072248e-10, "epoch": 5.6, "percentage": 19.44, "elapsed_time": "0:01:54", "remaining_time": "0:07:55"}
|
8 |
-
{"current_steps": 40, "total_steps": 180, "loss": 7.1535, "learning_rate": 4.415111107797445e-10, "epoch": 6.4, "percentage": 22.22, "elapsed_time": "0:02:11", "remaining_time": "0:07:39"}
|
9 |
-
{"current_steps": 45, "total_steps": 180, "loss": 6.9961, "learning_rate": 4.267766952966369e-10, "epoch": 7.2, "percentage": 25.0, "elapsed_time": "0:02:28", "remaining_time": "0:07:24"}
|
10 |
-
{"current_steps": 50, "total_steps": 180, "loss": 7.1581, "learning_rate": 4.106969024216348e-10, "epoch": 8.0, "percentage": 27.78, "elapsed_time": "0:02:44", "remaining_time": "0:07:08"}
|
11 |
-
{"current_steps": 55, "total_steps": 180, "loss": 7.3208, "learning_rate": 3.933941090877615e-10, "epoch": 8.8, "percentage": 30.56, "elapsed_time": "0:03:02", "remaining_time": "0:06:54"}
|
12 |
-
{"current_steps": 60, "total_steps": 180, "loss": 6.7152, "learning_rate": 3.7500000000000005e-10, "epoch": 9.6, "percentage": 33.33, "elapsed_time": "0:03:19", "remaining_time": "0:06:39"}
|
13 |
-
{"current_steps": 65, "total_steps": 180, "loss": 7.1168, "learning_rate": 3.556545654351749e-10, "epoch": 10.4, "percentage": 36.11, "elapsed_time": "0:03:36", "remaining_time": "0:06:23"}
|
14 |
-
{"current_steps": 70, "total_steps": 180, "loss": 7.143, "learning_rate": 3.3550503583141725e-10, "epoch": 11.2, "percentage": 38.89, "elapsed_time": "0:03:52", "remaining_time": "0:06:06"}
|
15 |
-
{"current_steps": 75, "total_steps": 180, "loss": 7.2724, "learning_rate": 3.147047612756302e-10, "epoch": 12.0, "percentage": 41.67, "elapsed_time": "0:04:09", "remaining_time": "0:05:49"}
|
16 |
-
{"current_steps": 80, "total_steps": 180, "loss": 6.995, "learning_rate": 2.9341204441673265e-10, "epoch": 12.8, "percentage": 44.44, "elapsed_time": "0:04:26", "remaining_time": "0:05:32"}
|
17 |
-
{"current_steps": 85, "total_steps": 180, "loss": 7.1134, "learning_rate": 2.717889356869146e-10, "epoch": 13.6, "percentage": 47.22, "elapsed_time": "0:04:42", "remaining_time": "0:05:15"}
|
18 |
-
{"current_steps": 90, "total_steps": 180, "loss": 7.2029, "learning_rate": 2.5e-10, "epoch": 14.4, "percentage": 50.0, "elapsed_time": "0:04:59", "remaining_time": "0:04:59"}
|
19 |
-
{"current_steps": 95, "total_steps": 180, "loss": 7.4673, "learning_rate": 2.2821106431308546e-10, "epoch": 15.2, "percentage": 52.78, "elapsed_time": "0:05:15", "remaining_time": "0:04:42"}
|
20 |
-
{"current_steps": 100, "total_steps": 180, "loss": 6.8563, "learning_rate": 2.0658795558326743e-10, "epoch": 16.0, "percentage": 55.56, "elapsed_time": "0:05:31", "remaining_time": "0:04:25"}
|
21 |
-
{"current_steps": 105, "total_steps": 180, "loss": 7.0683, "learning_rate": 1.852952387243698e-10, "epoch": 16.8, "percentage": 58.33, "elapsed_time": "0:05:48", "remaining_time": "0:04:08"}
|
22 |
-
{"current_steps": 110, "total_steps": 180, "loss": 7.1976, "learning_rate": 1.6449496416858284e-10, "epoch": 17.6, "percentage": 61.11, "elapsed_time": "0:06:04", "remaining_time": "0:03:52"}
|
23 |
-
{"current_steps": 115, "total_steps": 180, "loss": 7.1575, "learning_rate": 1.443454345648252e-10, "epoch": 18.4, "percentage": 63.89, "elapsed_time": "0:06:20", "remaining_time": "0:03:35"}
|
24 |
-
{"current_steps": 120, "total_steps": 180, "loss": 7.073, "learning_rate": 1.2500000000000006e-10, "epoch": 19.2, "percentage": 66.67, "elapsed_time": "0:06:37", "remaining_time": "0:03:18"}
|
25 |
-
{"current_steps": 125, "total_steps": 180, "loss": 7.0932, "learning_rate": 1.0660589091223855e-10, "epoch": 20.0, "percentage": 69.44, "elapsed_time": "0:06:53", "remaining_time": "0:03:01"}
|
26 |
-
{"current_steps": 130, "total_steps": 180, "loss": 7.1324, "learning_rate": 8.930309757836516e-11, "epoch": 20.8, "percentage": 72.22, "elapsed_time": "0:07:09", "remaining_time": "0:02:45"}
|
27 |
-
{"current_steps": 135, "total_steps": 180, "loss": 7.1434, "learning_rate": 7.322330470336314e-11, "epoch": 21.6, "percentage": 75.0, "elapsed_time": "0:07:27", "remaining_time": "0:02:29"}
|
28 |
-
{"current_steps": 140, "total_steps": 180, "loss": 7.0423, "learning_rate": 5.848888922025552e-11, "epoch": 22.4, "percentage": 77.78, "elapsed_time": "0:07:42", "remaining_time": "0:02:12"}
|
29 |
-
{"current_steps": 145, "total_steps": 180, "loss": 7.3638, "learning_rate": 4.5211988927752025e-11, "epoch": 23.2, "percentage": 80.56, "elapsed_time": "0:07:59", "remaining_time": "0:01:55"}
|
30 |
-
{"current_steps": 150, "total_steps": 180, "loss": 6.8454, "learning_rate": 3.3493649053890324e-11, "epoch": 24.0, "percentage": 83.33, "elapsed_time": "0:08:15", "remaining_time": "0:01:39"}
|
31 |
-
{"current_steps": 155, "total_steps": 180, "loss": 7.1881, "learning_rate": 2.3423053240837516e-11, "epoch": 24.8, "percentage": 86.11, "elapsed_time": "0:08:31", "remaining_time": "0:01:22"}
|
32 |
-
{"current_steps": 160, "total_steps": 180, "loss": 7.1249, "learning_rate": 1.5076844803522922e-11, "epoch": 25.6, "percentage": 88.89, "elapsed_time": "0:08:48", "remaining_time": "0:01:06"}
|
33 |
-
{"current_steps": 165, "total_steps": 180, "loss": 7.1924, "learning_rate": 8.51854342773295e-12, "epoch": 26.4, "percentage": 91.67, "elapsed_time": "0:09:04", "remaining_time": "0:00:49"}
|
34 |
-
{"current_steps": 170, "total_steps": 180, "loss": 6.9059, "learning_rate": 3.798061746947995e-12, "epoch": 27.2, "percentage": 94.44, "elapsed_time": "0:09:20", "remaining_time": "0:00:32"}
|
35 |
-
{"current_steps": 175, "total_steps": 180, "loss": 7.2371, "learning_rate": 9.513254770636138e-13, "epoch": 28.0, "percentage": 97.22, "elapsed_time": "0:09:37", "remaining_time": "0:00:16"}
|
36 |
-
{"current_steps": 180, "total_steps": 180, "loss": 7.0767, "learning_rate": 0.0, "epoch": 28.8, "percentage": 100.0, "elapsed_time": "0:09:53", "remaining_time": "0:00:00"}
|
37 |
-
{"current_steps": 180, "total_steps": 180, "epoch": 28.8, "percentage": 100.0, "elapsed_time": "0:09:53", "remaining_time": "0:00:00"}
|
|
|
1 |
+
{"current_steps": 5, "total_steps": 18, "loss": 7.2633, "learning_rate": 4.1069690242163484e-05, "epoch": 0.8, "percentage": 27.78, "elapsed_time": "0:00:17", "remaining_time": "0:00:45"}
|
2 |
+
{"current_steps": 10, "total_steps": 18, "loss": 6.3067, "learning_rate": 2.0658795558326743e-05, "epoch": 1.6, "percentage": 55.56, "elapsed_time": "0:00:33", "remaining_time": "0:00:27"}
|
3 |
+
{"current_steps": 15, "total_steps": 18, "loss": 6.4305, "learning_rate": 3.3493649053890326e-06, "epoch": 2.4, "percentage": 83.33, "elapsed_time": "0:00:50", "remaining_time": "0:00:10"}
|
4 |
+
{"current_steps": 18, "total_steps": 18, "epoch": 2.88, "percentage": 100.0, "elapsed_time": "0:00:59", "remaining_time": "0:00:00"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
trainer_state.json
CHANGED
@@ -1,281 +1,50 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch":
|
5 |
"eval_steps": 500,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.8,
|
13 |
-
"grad_norm": 2.
|
14 |
-
"learning_rate": 4.
|
15 |
-
"loss": 7.
|
16 |
"step": 5
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.6,
|
20 |
-
"grad_norm":
|
21 |
-
"learning_rate":
|
22 |
-
"loss": 6.
|
23 |
"step": 10
|
24 |
},
|
25 |
{
|
26 |
"epoch": 2.4,
|
27 |
-
"grad_norm":
|
28 |
-
"learning_rate":
|
29 |
-
"loss":
|
30 |
"step": 15
|
31 |
},
|
32 |
{
|
33 |
-
"epoch":
|
34 |
-
"
|
35 |
-
"
|
36 |
-
"
|
37 |
-
"
|
38 |
-
|
39 |
-
|
40 |
-
"epoch": 4.0,
|
41 |
-
"grad_norm": 3.2548422813415527,
|
42 |
-
"learning_rate": 4.765769467591626e-10,
|
43 |
-
"loss": 7.1004,
|
44 |
-
"step": 25
|
45 |
-
},
|
46 |
-
{
|
47 |
-
"epoch": 4.8,
|
48 |
-
"grad_norm": 2.268507480621338,
|
49 |
-
"learning_rate": 4.665063509461097e-10,
|
50 |
-
"loss": 6.9577,
|
51 |
-
"step": 30
|
52 |
-
},
|
53 |
-
{
|
54 |
-
"epoch": 5.6,
|
55 |
-
"grad_norm": 2.739196300506592,
|
56 |
-
"learning_rate": 4.54788011072248e-10,
|
57 |
-
"loss": 7.1725,
|
58 |
-
"step": 35
|
59 |
-
},
|
60 |
-
{
|
61 |
-
"epoch": 6.4,
|
62 |
-
"grad_norm": 2.399449348449707,
|
63 |
-
"learning_rate": 4.415111107797445e-10,
|
64 |
-
"loss": 7.1535,
|
65 |
-
"step": 40
|
66 |
-
},
|
67 |
-
{
|
68 |
-
"epoch": 7.2,
|
69 |
-
"grad_norm": 2.798766613006592,
|
70 |
-
"learning_rate": 4.267766952966369e-10,
|
71 |
-
"loss": 6.9961,
|
72 |
-
"step": 45
|
73 |
-
},
|
74 |
-
{
|
75 |
-
"epoch": 8.0,
|
76 |
-
"grad_norm": 2.742884635925293,
|
77 |
-
"learning_rate": 4.106969024216348e-10,
|
78 |
-
"loss": 7.1581,
|
79 |
-
"step": 50
|
80 |
-
},
|
81 |
-
{
|
82 |
-
"epoch": 8.8,
|
83 |
-
"grad_norm": 3.0411534309387207,
|
84 |
-
"learning_rate": 3.933941090877615e-10,
|
85 |
-
"loss": 7.3208,
|
86 |
-
"step": 55
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"epoch": 9.6,
|
90 |
-
"grad_norm": 2.590532064437866,
|
91 |
-
"learning_rate": 3.7500000000000005e-10,
|
92 |
-
"loss": 6.7152,
|
93 |
-
"step": 60
|
94 |
-
},
|
95 |
-
{
|
96 |
-
"epoch": 10.4,
|
97 |
-
"grad_norm": 2.6501505374908447,
|
98 |
-
"learning_rate": 3.556545654351749e-10,
|
99 |
-
"loss": 7.1168,
|
100 |
-
"step": 65
|
101 |
-
},
|
102 |
-
{
|
103 |
-
"epoch": 11.2,
|
104 |
-
"grad_norm": 2.6842129230499268,
|
105 |
-
"learning_rate": 3.3550503583141725e-10,
|
106 |
-
"loss": 7.143,
|
107 |
-
"step": 70
|
108 |
-
},
|
109 |
-
{
|
110 |
-
"epoch": 12.0,
|
111 |
-
"grad_norm": 2.7974400520324707,
|
112 |
-
"learning_rate": 3.147047612756302e-10,
|
113 |
-
"loss": 7.2724,
|
114 |
-
"step": 75
|
115 |
-
},
|
116 |
-
{
|
117 |
-
"epoch": 12.8,
|
118 |
-
"grad_norm": 2.68391489982605,
|
119 |
-
"learning_rate": 2.9341204441673265e-10,
|
120 |
-
"loss": 6.995,
|
121 |
-
"step": 80
|
122 |
-
},
|
123 |
-
{
|
124 |
-
"epoch": 13.6,
|
125 |
-
"grad_norm": 3.0854671001434326,
|
126 |
-
"learning_rate": 2.717889356869146e-10,
|
127 |
-
"loss": 7.1134,
|
128 |
-
"step": 85
|
129 |
-
},
|
130 |
-
{
|
131 |
-
"epoch": 14.4,
|
132 |
-
"grad_norm": 2.772061347961426,
|
133 |
-
"learning_rate": 2.5e-10,
|
134 |
-
"loss": 7.2029,
|
135 |
-
"step": 90
|
136 |
-
},
|
137 |
-
{
|
138 |
-
"epoch": 15.2,
|
139 |
-
"grad_norm": 2.6627767086029053,
|
140 |
-
"learning_rate": 2.2821106431308546e-10,
|
141 |
-
"loss": 7.4673,
|
142 |
-
"step": 95
|
143 |
-
},
|
144 |
-
{
|
145 |
-
"epoch": 16.0,
|
146 |
-
"grad_norm": 2.7043120861053467,
|
147 |
-
"learning_rate": 2.0658795558326743e-10,
|
148 |
-
"loss": 6.8563,
|
149 |
-
"step": 100
|
150 |
-
},
|
151 |
-
{
|
152 |
-
"epoch": 16.8,
|
153 |
-
"grad_norm": 2.4765264987945557,
|
154 |
-
"learning_rate": 1.852952387243698e-10,
|
155 |
-
"loss": 7.0683,
|
156 |
-
"step": 105
|
157 |
-
},
|
158 |
-
{
|
159 |
-
"epoch": 17.6,
|
160 |
-
"grad_norm": 2.775627613067627,
|
161 |
-
"learning_rate": 1.6449496416858284e-10,
|
162 |
-
"loss": 7.1976,
|
163 |
-
"step": 110
|
164 |
-
},
|
165 |
-
{
|
166 |
-
"epoch": 18.4,
|
167 |
-
"grad_norm": 2.3891263008117676,
|
168 |
-
"learning_rate": 1.443454345648252e-10,
|
169 |
-
"loss": 7.1575,
|
170 |
-
"step": 115
|
171 |
-
},
|
172 |
-
{
|
173 |
-
"epoch": 19.2,
|
174 |
-
"grad_norm": 2.5396955013275146,
|
175 |
-
"learning_rate": 1.2500000000000006e-10,
|
176 |
-
"loss": 7.073,
|
177 |
-
"step": 120
|
178 |
-
},
|
179 |
-
{
|
180 |
-
"epoch": 20.0,
|
181 |
-
"grad_norm": 2.4532394409179688,
|
182 |
-
"learning_rate": 1.0660589091223855e-10,
|
183 |
-
"loss": 7.0932,
|
184 |
-
"step": 125
|
185 |
-
},
|
186 |
-
{
|
187 |
-
"epoch": 20.8,
|
188 |
-
"grad_norm": 2.822531223297119,
|
189 |
-
"learning_rate": 8.930309757836516e-11,
|
190 |
-
"loss": 7.1324,
|
191 |
-
"step": 130
|
192 |
-
},
|
193 |
-
{
|
194 |
-
"epoch": 21.6,
|
195 |
-
"grad_norm": 2.5621225833892822,
|
196 |
-
"learning_rate": 7.322330470336314e-11,
|
197 |
-
"loss": 7.1434,
|
198 |
-
"step": 135
|
199 |
-
},
|
200 |
-
{
|
201 |
-
"epoch": 22.4,
|
202 |
-
"grad_norm": 2.63484263420105,
|
203 |
-
"learning_rate": 5.848888922025552e-11,
|
204 |
-
"loss": 7.0423,
|
205 |
-
"step": 140
|
206 |
-
},
|
207 |
-
{
|
208 |
-
"epoch": 23.2,
|
209 |
-
"grad_norm": 2.882169246673584,
|
210 |
-
"learning_rate": 4.5211988927752025e-11,
|
211 |
-
"loss": 7.3638,
|
212 |
-
"step": 145
|
213 |
-
},
|
214 |
-
{
|
215 |
-
"epoch": 24.0,
|
216 |
-
"grad_norm": 2.356477975845337,
|
217 |
-
"learning_rate": 3.3493649053890324e-11,
|
218 |
-
"loss": 6.8454,
|
219 |
-
"step": 150
|
220 |
-
},
|
221 |
-
{
|
222 |
-
"epoch": 24.8,
|
223 |
-
"grad_norm": 3.1740143299102783,
|
224 |
-
"learning_rate": 2.3423053240837516e-11,
|
225 |
-
"loss": 7.1881,
|
226 |
-
"step": 155
|
227 |
-
},
|
228 |
-
{
|
229 |
-
"epoch": 25.6,
|
230 |
-
"grad_norm": 2.634425640106201,
|
231 |
-
"learning_rate": 1.5076844803522922e-11,
|
232 |
-
"loss": 7.1249,
|
233 |
-
"step": 160
|
234 |
-
},
|
235 |
-
{
|
236 |
-
"epoch": 26.4,
|
237 |
-
"grad_norm": 2.412172317504883,
|
238 |
-
"learning_rate": 8.51854342773295e-12,
|
239 |
-
"loss": 7.1924,
|
240 |
-
"step": 165
|
241 |
-
},
|
242 |
-
{
|
243 |
-
"epoch": 27.2,
|
244 |
-
"grad_norm": 2.655557870864868,
|
245 |
-
"learning_rate": 3.798061746947995e-12,
|
246 |
-
"loss": 6.9059,
|
247 |
-
"step": 170
|
248 |
-
},
|
249 |
-
{
|
250 |
-
"epoch": 28.0,
|
251 |
-
"grad_norm": 2.859827756881714,
|
252 |
-
"learning_rate": 9.513254770636138e-13,
|
253 |
-
"loss": 7.2371,
|
254 |
-
"step": 175
|
255 |
-
},
|
256 |
-
{
|
257 |
-
"epoch": 28.8,
|
258 |
-
"grad_norm": 2.7501277923583984,
|
259 |
-
"learning_rate": 0.0,
|
260 |
-
"loss": 7.0767,
|
261 |
-
"step": 180
|
262 |
-
},
|
263 |
-
{
|
264 |
-
"epoch": 28.8,
|
265 |
-
"step": 180,
|
266 |
-
"total_flos": 1335967054725120.0,
|
267 |
-
"train_loss": 7.109028116861979,
|
268 |
-
"train_runtime": 593.1625,
|
269 |
-
"train_samples_per_second": 5.058,
|
270 |
-
"train_steps_per_second": 0.303
|
271 |
}
|
272 |
],
|
273 |
"logging_steps": 5,
|
274 |
-
"max_steps":
|
275 |
"num_input_tokens_seen": 0,
|
276 |
-
"num_train_epochs":
|
277 |
"save_steps": 100,
|
278 |
-
"total_flos":
|
279 |
"train_batch_size": 2,
|
280 |
"trial_name": null,
|
281 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 2.88,
|
5 |
"eval_steps": 500,
|
6 |
+
"global_step": 18,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.8,
|
13 |
+
"grad_norm": 2.860818386077881,
|
14 |
+
"learning_rate": 4.1069690242163484e-05,
|
15 |
+
"loss": 7.2633,
|
16 |
"step": 5
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.6,
|
20 |
+
"grad_norm": 4.06754207611084,
|
21 |
+
"learning_rate": 2.0658795558326743e-05,
|
22 |
+
"loss": 6.3067,
|
23 |
"step": 10
|
24 |
},
|
25 |
{
|
26 |
"epoch": 2.4,
|
27 |
+
"grad_norm": 4.378876686096191,
|
28 |
+
"learning_rate": 3.3493649053890326e-06,
|
29 |
+
"loss": 6.4305,
|
30 |
"step": 15
|
31 |
},
|
32 |
{
|
33 |
+
"epoch": 2.88,
|
34 |
+
"step": 18,
|
35 |
+
"total_flos": 133622157312000.0,
|
36 |
+
"train_loss": 6.60821893480089,
|
37 |
+
"train_runtime": 59.777,
|
38 |
+
"train_samples_per_second": 5.019,
|
39 |
+
"train_steps_per_second": 0.301
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
}
|
41 |
],
|
42 |
"logging_steps": 5,
|
43 |
+
"max_steps": 18,
|
44 |
"num_input_tokens_seen": 0,
|
45 |
+
"num_train_epochs": 3,
|
46 |
"save_steps": 100,
|
47 |
+
"total_flos": 133622157312000.0,
|
48 |
"train_batch_size": 2,
|
49 |
"trial_name": null,
|
50 |
"trial_params": null
|
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:877561c09d9ec252be597601fb20cc3870d9bf16887509642b7e8821a1e2e78c
|
3 |
+
size 5176
|
training_loss.png
CHANGED
![]() |
![]() |