Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- adapter_config.json +3 -3
- adapter_model.safetensors +1 -1
- checkpoint-10/adapter_config.json +3 -3
- checkpoint-10/adapter_model.safetensors +1 -1
- checkpoint-10/optimizer.pt +1 -1
- checkpoint-10/scheduler.pt +1 -1
- checkpoint-10/trainer_state.json +10 -10
- checkpoint-10/training_args.bin +1 -1
- checkpoint-100/README.md +1 -1
- checkpoint-100/adapter_config.json +3 -3
- checkpoint-100/adapter_model.safetensors +1 -1
- checkpoint-100/optimizer.pt +1 -1
- checkpoint-100/scheduler.pt +1 -1
- checkpoint-100/trainer_state.json +73 -73
- checkpoint-100/training_args.bin +1 -1
- checkpoint-110/README.md +1 -1
- checkpoint-110/adapter_config.json +3 -3
- checkpoint-110/adapter_model.safetensors +1 -1
- checkpoint-110/optimizer.pt +1 -1
- checkpoint-110/scheduler.pt +1 -1
- checkpoint-110/trainer_state.json +79 -79
- checkpoint-110/training_args.bin +1 -1
- checkpoint-120/README.md +1 -1
- checkpoint-120/adapter_config.json +3 -3
- checkpoint-120/adapter_model.safetensors +1 -1
- checkpoint-120/optimizer.pt +1 -1
- checkpoint-120/scheduler.pt +1 -1
- checkpoint-120/trainer_state.json +87 -87
- checkpoint-120/training_args.bin +1 -1
- checkpoint-20/adapter_config.json +3 -3
- checkpoint-20/adapter_model.safetensors +1 -1
- checkpoint-20/optimizer.pt +1 -1
- checkpoint-20/scheduler.pt +1 -1
- checkpoint-20/trainer_state.json +17 -17
- checkpoint-20/training_args.bin +1 -1
- checkpoint-30/adapter_config.json +3 -3
- checkpoint-30/adapter_model.safetensors +1 -1
- checkpoint-30/optimizer.pt +1 -1
- checkpoint-30/scheduler.pt +1 -1
- checkpoint-30/trainer_state.json +23 -23
- checkpoint-30/training_args.bin +1 -1
- checkpoint-40/adapter_config.json +3 -3
- checkpoint-40/adapter_model.safetensors +1 -1
- checkpoint-40/optimizer.pt +1 -1
- checkpoint-40/scheduler.pt +1 -1
- checkpoint-40/trainer_state.json +29 -29
- checkpoint-40/training_args.bin +1 -1
- checkpoint-50/adapter_config.json +3 -3
- checkpoint-50/adapter_model.safetensors +1 -1
- checkpoint-50/optimizer.pt +1 -1
adapter_config.json
CHANGED
@@ -20,10 +20,10 @@
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
-
"k_proj",
|
24 |
"q_proj",
|
25 |
-
"
|
26 |
-
"
|
|
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
|
|
23 |
"q_proj",
|
24 |
+
"v_proj",
|
25 |
+
"k_proj",
|
26 |
+
"o_proj"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 67143296
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f90c1d3ed853f5e7e29d9c0d39bdab0cc26bd4d4ea5fbb602291f4c783b23d04
|
3 |
size 67143296
|
checkpoint-10/adapter_config.json
CHANGED
@@ -20,10 +20,10 @@
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
-
"k_proj",
|
24 |
"q_proj",
|
25 |
-
"
|
26 |
-
"
|
|
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
|
|
23 |
"q_proj",
|
24 |
+
"v_proj",
|
25 |
+
"k_proj",
|
26 |
+
"o_proj"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
checkpoint-10/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 67143296
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:98e20323c453a19bc80c8d7d2e78dd31c294b2902e1e5c93d6fc3bb60807b9a9
|
3 |
size 67143296
|
checkpoint-10/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 134433530
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6e8ca82274d47b8f0c29a9ec38dc99330e73ae5a77ba65837b1561b4da10e245
|
3 |
size 134433530
|
checkpoint-10/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:47a859c720f996d82c8b4e6126df0e86212eb2bb6933303af0eacc71bf5de32f
|
3 |
size 1064
|
checkpoint-10/trainer_state.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"best_metric": 1.
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-10",
|
4 |
"epoch": 1.1111111111111112,
|
5 |
"eval_steps": 10,
|
@@ -10,24 +10,24 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
-
"grad_norm": 0.
|
14 |
-
"learning_rate": 0.
|
15 |
-
"loss": 2.
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
-
"eval_loss": 1.
|
21 |
-
"eval_runtime":
|
22 |
-
"eval_samples_per_second": 1.
|
23 |
-
"eval_steps_per_second": 0.
|
24 |
"step": 10
|
25 |
}
|
26 |
],
|
27 |
"logging_steps": 10,
|
28 |
-
"max_steps":
|
29 |
"num_input_tokens_seen": 0,
|
30 |
-
"num_train_epochs":
|
31 |
"save_steps": 10,
|
32 |
"stateful_callbacks": {
|
33 |
"EarlyStoppingCallback": {
|
|
|
1 |
{
|
2 |
+
"best_metric": 1.729261875152588,
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-10",
|
4 |
"epoch": 1.1111111111111112,
|
5 |
"eval_steps": 10,
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
+
"grad_norm": 0.022457197308540344,
|
14 |
+
"learning_rate": 0.0001925925925925926,
|
15 |
+
"loss": 2.0406,
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
+
"eval_loss": 1.729261875152588,
|
21 |
+
"eval_runtime": 34.8953,
|
22 |
+
"eval_samples_per_second": 1.032,
|
23 |
+
"eval_steps_per_second": 0.143,
|
24 |
"step": 10
|
25 |
}
|
26 |
],
|
27 |
"logging_steps": 10,
|
28 |
+
"max_steps": 270,
|
29 |
"num_input_tokens_seen": 0,
|
30 |
+
"num_train_epochs": 30,
|
31 |
"save_steps": 10,
|
32 |
"stateful_callbacks": {
|
33 |
"EarlyStoppingCallback": {
|
checkpoint-10/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5112
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
|
3 |
size 5112
|
checkpoint-100/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
-
library_name: peft
|
3 |
base_model: TheBloke/Llama-2-7B-fp16
|
|
|
4 |
---
|
5 |
|
6 |
# Model Card for Model ID
|
|
|
1 |
---
|
|
|
2 |
base_model: TheBloke/Llama-2-7B-fp16
|
3 |
+
library_name: peft
|
4 |
---
|
5 |
|
6 |
# Model Card for Model ID
|
checkpoint-100/adapter_config.json
CHANGED
@@ -20,10 +20,10 @@
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
-
"
|
24 |
"v_proj",
|
25 |
-
"
|
26 |
-
"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
+
"q_proj",
|
24 |
"v_proj",
|
25 |
+
"k_proj",
|
26 |
+
"o_proj"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
checkpoint-100/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 67143296
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1d0047624e79262540578984e129869e2d6934ddd722dbc7dfc5f942e628b000
|
3 |
size 67143296
|
checkpoint-100/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 134433530
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df94a2231d5455b24c3ce8db2477572330be1aa99e33c02b4ac96351f08c9fc8
|
3 |
size 134433530
|
checkpoint-100/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:127e093ac25e89499f96e10a77287e7041566fb667c0634628ee414b8d0443ea
|
3 |
size 1064
|
checkpoint-100/trainer_state.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"best_metric": 1.
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-90",
|
4 |
"epoch": 11.11111111111111,
|
5 |
"eval_steps": 10,
|
@@ -10,159 +10,159 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
-
"grad_norm": 0.
|
14 |
-
"learning_rate": 0.
|
15 |
-
"loss": 2.
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
-
"eval_loss": 1.
|
21 |
-
"eval_runtime": 34.
|
22 |
-
"eval_samples_per_second": 1.
|
23 |
-
"eval_steps_per_second": 0.
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
-
"grad_norm": 0.
|
29 |
-
"learning_rate": 0.
|
30 |
-
"loss": 1.
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
-
"eval_loss": 1.
|
36 |
-
"eval_runtime": 34.
|
37 |
-
"eval_samples_per_second": 1.
|
38 |
-
"eval_steps_per_second": 0.
|
39 |
"step": 20
|
40 |
},
|
41 |
{
|
42 |
"epoch": 3.3333333333333335,
|
43 |
-
"grad_norm": 0.
|
44 |
-
"learning_rate": 0.
|
45 |
-
"loss": 1.
|
46 |
"step": 30
|
47 |
},
|
48 |
{
|
49 |
"epoch": 3.3333333333333335,
|
50 |
-
"eval_loss": 1.
|
51 |
-
"eval_runtime": 34.
|
52 |
-
"eval_samples_per_second": 1.
|
53 |
-
"eval_steps_per_second": 0.
|
54 |
"step": 30
|
55 |
},
|
56 |
{
|
57 |
"epoch": 4.444444444444445,
|
58 |
-
"grad_norm": 0.
|
59 |
-
"learning_rate": 0.
|
60 |
-
"loss": 1.
|
61 |
"step": 40
|
62 |
},
|
63 |
{
|
64 |
"epoch": 4.444444444444445,
|
65 |
-
"eval_loss": 1.
|
66 |
-
"eval_runtime": 34.
|
67 |
-
"eval_samples_per_second": 1.
|
68 |
-
"eval_steps_per_second": 0.
|
69 |
"step": 40
|
70 |
},
|
71 |
{
|
72 |
"epoch": 5.555555555555555,
|
73 |
-
"grad_norm": 0.
|
74 |
-
"learning_rate": 0.
|
75 |
-
"loss": 1.
|
76 |
"step": 50
|
77 |
},
|
78 |
{
|
79 |
"epoch": 5.555555555555555,
|
80 |
-
"eval_loss": 1.
|
81 |
-
"eval_runtime": 34.
|
82 |
-
"eval_samples_per_second": 1.
|
83 |
-
"eval_steps_per_second": 0.
|
84 |
"step": 50
|
85 |
},
|
86 |
{
|
87 |
"epoch": 6.666666666666667,
|
88 |
-
"grad_norm": 0.
|
89 |
-
"learning_rate": 0.
|
90 |
-
"loss": 1.
|
91 |
"step": 60
|
92 |
},
|
93 |
{
|
94 |
"epoch": 6.666666666666667,
|
95 |
-
"eval_loss": 1.
|
96 |
-
"eval_runtime": 34.
|
97 |
-
"eval_samples_per_second": 1.
|
98 |
-
"eval_steps_per_second": 0.
|
99 |
"step": 60
|
100 |
},
|
101 |
{
|
102 |
"epoch": 7.777777777777778,
|
103 |
-
"grad_norm": 0.
|
104 |
-
"learning_rate":
|
105 |
-
"loss": 0.
|
106 |
"step": 70
|
107 |
},
|
108 |
{
|
109 |
"epoch": 7.777777777777778,
|
110 |
-
"eval_loss": 1.
|
111 |
-
"eval_runtime": 34.
|
112 |
-
"eval_samples_per_second": 1.
|
113 |
-
"eval_steps_per_second": 0.
|
114 |
"step": 70
|
115 |
},
|
116 |
{
|
117 |
"epoch": 8.88888888888889,
|
118 |
-
"grad_norm": 0.
|
119 |
-
"learning_rate":
|
120 |
-
"loss": 0.
|
121 |
"step": 80
|
122 |
},
|
123 |
{
|
124 |
"epoch": 8.88888888888889,
|
125 |
-
"eval_loss": 1.
|
126 |
-
"eval_runtime": 34.
|
127 |
-
"eval_samples_per_second": 1.
|
128 |
-
"eval_steps_per_second": 0.
|
129 |
"step": 80
|
130 |
},
|
131 |
{
|
132 |
"epoch": 10.0,
|
133 |
-
"grad_norm": 0.
|
134 |
-
"learning_rate":
|
135 |
-
"loss": 0.
|
136 |
"step": 90
|
137 |
},
|
138 |
{
|
139 |
"epoch": 10.0,
|
140 |
-
"eval_loss": 1.
|
141 |
-
"eval_runtime": 34.
|
142 |
-
"eval_samples_per_second": 1.
|
143 |
-
"eval_steps_per_second": 0.
|
144 |
"step": 90
|
145 |
},
|
146 |
{
|
147 |
"epoch": 11.11111111111111,
|
148 |
-
"grad_norm": 0.
|
149 |
-
"learning_rate":
|
150 |
-
"loss": 0.
|
151 |
"step": 100
|
152 |
},
|
153 |
{
|
154 |
"epoch": 11.11111111111111,
|
155 |
-
"eval_loss": 1.
|
156 |
-
"eval_runtime": 34.
|
157 |
-
"eval_samples_per_second": 1.
|
158 |
-
"eval_steps_per_second": 0.
|
159 |
"step": 100
|
160 |
}
|
161 |
],
|
162 |
"logging_steps": 10,
|
163 |
-
"max_steps":
|
164 |
"num_input_tokens_seen": 0,
|
165 |
-
"num_train_epochs":
|
166 |
"save_steps": 10,
|
167 |
"stateful_callbacks": {
|
168 |
"EarlyStoppingCallback": {
|
|
|
1 |
{
|
2 |
+
"best_metric": 1.173593521118164,
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-90",
|
4 |
"epoch": 11.11111111111111,
|
5 |
"eval_steps": 10,
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
+
"grad_norm": 0.022457197308540344,
|
14 |
+
"learning_rate": 0.0001925925925925926,
|
15 |
+
"loss": 2.0406,
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
+
"eval_loss": 1.729261875152588,
|
21 |
+
"eval_runtime": 34.8953,
|
22 |
+
"eval_samples_per_second": 1.032,
|
23 |
+
"eval_steps_per_second": 0.143,
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
+
"grad_norm": 0.018787898123264313,
|
29 |
+
"learning_rate": 0.0001851851851851852,
|
30 |
+
"loss": 1.6016,
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
+
"eval_loss": 1.5362553596496582,
|
36 |
+
"eval_runtime": 34.8752,
|
37 |
+
"eval_samples_per_second": 1.032,
|
38 |
+
"eval_steps_per_second": 0.143,
|
39 |
"step": 20
|
40 |
},
|
41 |
{
|
42 |
"epoch": 3.3333333333333335,
|
43 |
+
"grad_norm": 0.021070128306746483,
|
44 |
+
"learning_rate": 0.00017777777777777779,
|
45 |
+
"loss": 1.3937,
|
46 |
"step": 30
|
47 |
},
|
48 |
{
|
49 |
"epoch": 3.3333333333333335,
|
50 |
+
"eval_loss": 1.4144253730773926,
|
51 |
+
"eval_runtime": 34.9429,
|
52 |
+
"eval_samples_per_second": 1.03,
|
53 |
+
"eval_steps_per_second": 0.143,
|
54 |
"step": 30
|
55 |
},
|
56 |
{
|
57 |
"epoch": 4.444444444444445,
|
58 |
+
"grad_norm": 0.037991978228092194,
|
59 |
+
"learning_rate": 0.00017037037037037037,
|
60 |
+
"loss": 1.2721,
|
61 |
"step": 40
|
62 |
},
|
63 |
{
|
64 |
"epoch": 4.444444444444445,
|
65 |
+
"eval_loss": 1.3360365629196167,
|
66 |
+
"eval_runtime": 34.8947,
|
67 |
+
"eval_samples_per_second": 1.032,
|
68 |
+
"eval_steps_per_second": 0.143,
|
69 |
"step": 40
|
70 |
},
|
71 |
{
|
72 |
"epoch": 5.555555555555555,
|
73 |
+
"grad_norm": 0.029117526486516,
|
74 |
+
"learning_rate": 0.00016296296296296295,
|
75 |
+
"loss": 1.1384,
|
76 |
"step": 50
|
77 |
},
|
78 |
{
|
79 |
"epoch": 5.555555555555555,
|
80 |
+
"eval_loss": 1.2785382270812988,
|
81 |
+
"eval_runtime": 34.8447,
|
82 |
+
"eval_samples_per_second": 1.033,
|
83 |
+
"eval_steps_per_second": 0.143,
|
84 |
"step": 50
|
85 |
},
|
86 |
{
|
87 |
"epoch": 6.666666666666667,
|
88 |
+
"grad_norm": 0.0317281112074852,
|
89 |
+
"learning_rate": 0.00015555555555555556,
|
90 |
+
"loss": 1.0023,
|
91 |
"step": 60
|
92 |
},
|
93 |
{
|
94 |
"epoch": 6.666666666666667,
|
95 |
+
"eval_loss": 1.2417998313903809,
|
96 |
+
"eval_runtime": 34.8141,
|
97 |
+
"eval_samples_per_second": 1.034,
|
98 |
+
"eval_steps_per_second": 0.144,
|
99 |
"step": 60
|
100 |
},
|
101 |
{
|
102 |
"epoch": 7.777777777777778,
|
103 |
+
"grad_norm": 0.034914035350084305,
|
104 |
+
"learning_rate": 0.00014814814814814815,
|
105 |
+
"loss": 0.9166,
|
106 |
"step": 70
|
107 |
},
|
108 |
{
|
109 |
"epoch": 7.777777777777778,
|
110 |
+
"eval_loss": 1.2166908979415894,
|
111 |
+
"eval_runtime": 34.8956,
|
112 |
+
"eval_samples_per_second": 1.032,
|
113 |
+
"eval_steps_per_second": 0.143,
|
114 |
"step": 70
|
115 |
},
|
116 |
{
|
117 |
"epoch": 8.88888888888889,
|
118 |
+
"grad_norm": 0.04872061312198639,
|
119 |
+
"learning_rate": 0.00014074074074074076,
|
120 |
+
"loss": 0.7726,
|
121 |
"step": 80
|
122 |
},
|
123 |
{
|
124 |
"epoch": 8.88888888888889,
|
125 |
+
"eval_loss": 1.19890296459198,
|
126 |
+
"eval_runtime": 34.8433,
|
127 |
+
"eval_samples_per_second": 1.033,
|
128 |
+
"eval_steps_per_second": 0.143,
|
129 |
"step": 80
|
130 |
},
|
131 |
{
|
132 |
"epoch": 10.0,
|
133 |
+
"grad_norm": 0.04901803284883499,
|
134 |
+
"learning_rate": 0.00013333333333333334,
|
135 |
+
"loss": 0.676,
|
136 |
"step": 90
|
137 |
},
|
138 |
{
|
139 |
"epoch": 10.0,
|
140 |
+
"eval_loss": 1.173593521118164,
|
141 |
+
"eval_runtime": 34.7999,
|
142 |
+
"eval_samples_per_second": 1.034,
|
143 |
+
"eval_steps_per_second": 0.144,
|
144 |
"step": 90
|
145 |
},
|
146 |
{
|
147 |
"epoch": 11.11111111111111,
|
148 |
+
"grad_norm": 0.055481575429439545,
|
149 |
+
"learning_rate": 0.00012592592592592592,
|
150 |
+
"loss": 0.56,
|
151 |
"step": 100
|
152 |
},
|
153 |
{
|
154 |
"epoch": 11.11111111111111,
|
155 |
+
"eval_loss": 1.2059063911437988,
|
156 |
+
"eval_runtime": 34.8432,
|
157 |
+
"eval_samples_per_second": 1.033,
|
158 |
+
"eval_steps_per_second": 0.143,
|
159 |
"step": 100
|
160 |
}
|
161 |
],
|
162 |
"logging_steps": 10,
|
163 |
+
"max_steps": 270,
|
164 |
"num_input_tokens_seen": 0,
|
165 |
+
"num_train_epochs": 30,
|
166 |
"save_steps": 10,
|
167 |
"stateful_callbacks": {
|
168 |
"EarlyStoppingCallback": {
|
checkpoint-100/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5112
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
|
3 |
size 5112
|
checkpoint-110/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
-
library_name: peft
|
3 |
base_model: TheBloke/Llama-2-7B-fp16
|
|
|
4 |
---
|
5 |
|
6 |
# Model Card for Model ID
|
|
|
1 |
---
|
|
|
2 |
base_model: TheBloke/Llama-2-7B-fp16
|
3 |
+
library_name: peft
|
4 |
---
|
5 |
|
6 |
# Model Card for Model ID
|
checkpoint-110/adapter_config.json
CHANGED
@@ -20,10 +20,10 @@
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
-
"
|
24 |
"v_proj",
|
25 |
-
"
|
26 |
-
"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
+
"q_proj",
|
24 |
"v_proj",
|
25 |
+
"k_proj",
|
26 |
+
"o_proj"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
checkpoint-110/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 67143296
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:144bb4cb915061effe137c30fcd2897134d3bad9790d3265733214a882cd96fa
|
3 |
size 67143296
|
checkpoint-110/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 134433530
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8e75cb352b366f1f8dcb73f6cbcd9937088a39b25a9a826d16c8594e055eea58
|
3 |
size 134433530
|
checkpoint-110/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2ed11929090acc3d040cabb379d312daf9924e0e46fa0a1c8884a63973944e92
|
3 |
size 1064
|
checkpoint-110/trainer_state.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"best_metric": 1.
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-90",
|
4 |
"epoch": 12.222222222222221,
|
5 |
"eval_steps": 10,
|
@@ -10,174 +10,174 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
-
"grad_norm": 0.
|
14 |
-
"learning_rate": 0.
|
15 |
-
"loss": 2.
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
-
"eval_loss": 1.
|
21 |
-
"eval_runtime": 34.
|
22 |
-
"eval_samples_per_second": 1.
|
23 |
-
"eval_steps_per_second": 0.
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
-
"grad_norm": 0.
|
29 |
-
"learning_rate": 0.
|
30 |
-
"loss": 1.
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
-
"eval_loss": 1.
|
36 |
-
"eval_runtime": 34.
|
37 |
-
"eval_samples_per_second": 1.
|
38 |
-
"eval_steps_per_second": 0.
|
39 |
"step": 20
|
40 |
},
|
41 |
{
|
42 |
"epoch": 3.3333333333333335,
|
43 |
-
"grad_norm": 0.
|
44 |
-
"learning_rate": 0.
|
45 |
-
"loss": 1.
|
46 |
"step": 30
|
47 |
},
|
48 |
{
|
49 |
"epoch": 3.3333333333333335,
|
50 |
-
"eval_loss": 1.
|
51 |
-
"eval_runtime": 34.
|
52 |
-
"eval_samples_per_second": 1.
|
53 |
-
"eval_steps_per_second": 0.
|
54 |
"step": 30
|
55 |
},
|
56 |
{
|
57 |
"epoch": 4.444444444444445,
|
58 |
-
"grad_norm": 0.
|
59 |
-
"learning_rate": 0.
|
60 |
-
"loss": 1.
|
61 |
"step": 40
|
62 |
},
|
63 |
{
|
64 |
"epoch": 4.444444444444445,
|
65 |
-
"eval_loss": 1.
|
66 |
-
"eval_runtime": 34.
|
67 |
-
"eval_samples_per_second": 1.
|
68 |
-
"eval_steps_per_second": 0.
|
69 |
"step": 40
|
70 |
},
|
71 |
{
|
72 |
"epoch": 5.555555555555555,
|
73 |
-
"grad_norm": 0.
|
74 |
-
"learning_rate": 0.
|
75 |
-
"loss": 1.
|
76 |
"step": 50
|
77 |
},
|
78 |
{
|
79 |
"epoch": 5.555555555555555,
|
80 |
-
"eval_loss": 1.
|
81 |
-
"eval_runtime": 34.
|
82 |
-
"eval_samples_per_second": 1.
|
83 |
-
"eval_steps_per_second": 0.
|
84 |
"step": 50
|
85 |
},
|
86 |
{
|
87 |
"epoch": 6.666666666666667,
|
88 |
-
"grad_norm": 0.
|
89 |
-
"learning_rate": 0.
|
90 |
-
"loss": 1.
|
91 |
"step": 60
|
92 |
},
|
93 |
{
|
94 |
"epoch": 6.666666666666667,
|
95 |
-
"eval_loss": 1.
|
96 |
-
"eval_runtime": 34.
|
97 |
-
"eval_samples_per_second": 1.
|
98 |
-
"eval_steps_per_second": 0.
|
99 |
"step": 60
|
100 |
},
|
101 |
{
|
102 |
"epoch": 7.777777777777778,
|
103 |
-
"grad_norm": 0.
|
104 |
-
"learning_rate":
|
105 |
-
"loss": 0.
|
106 |
"step": 70
|
107 |
},
|
108 |
{
|
109 |
"epoch": 7.777777777777778,
|
110 |
-
"eval_loss": 1.
|
111 |
-
"eval_runtime": 34.
|
112 |
-
"eval_samples_per_second": 1.
|
113 |
-
"eval_steps_per_second": 0.
|
114 |
"step": 70
|
115 |
},
|
116 |
{
|
117 |
"epoch": 8.88888888888889,
|
118 |
-
"grad_norm": 0.
|
119 |
-
"learning_rate":
|
120 |
-
"loss": 0.
|
121 |
"step": 80
|
122 |
},
|
123 |
{
|
124 |
"epoch": 8.88888888888889,
|
125 |
-
"eval_loss": 1.
|
126 |
-
"eval_runtime": 34.
|
127 |
-
"eval_samples_per_second": 1.
|
128 |
-
"eval_steps_per_second": 0.
|
129 |
"step": 80
|
130 |
},
|
131 |
{
|
132 |
"epoch": 10.0,
|
133 |
-
"grad_norm": 0.
|
134 |
-
"learning_rate":
|
135 |
-
"loss": 0.
|
136 |
"step": 90
|
137 |
},
|
138 |
{
|
139 |
"epoch": 10.0,
|
140 |
-
"eval_loss": 1.
|
141 |
-
"eval_runtime": 34.
|
142 |
-
"eval_samples_per_second": 1.
|
143 |
-
"eval_steps_per_second": 0.
|
144 |
"step": 90
|
145 |
},
|
146 |
{
|
147 |
"epoch": 11.11111111111111,
|
148 |
-
"grad_norm": 0.
|
149 |
-
"learning_rate":
|
150 |
-
"loss": 0.
|
151 |
"step": 100
|
152 |
},
|
153 |
{
|
154 |
"epoch": 11.11111111111111,
|
155 |
-
"eval_loss": 1.
|
156 |
-
"eval_runtime": 34.
|
157 |
-
"eval_samples_per_second": 1.
|
158 |
-
"eval_steps_per_second": 0.
|
159 |
"step": 100
|
160 |
},
|
161 |
{
|
162 |
"epoch": 12.222222222222221,
|
163 |
-
"grad_norm": 0.
|
164 |
-
"learning_rate":
|
165 |
-
"loss": 0.
|
166 |
"step": 110
|
167 |
},
|
168 |
{
|
169 |
"epoch": 12.222222222222221,
|
170 |
-
"eval_loss": 1.
|
171 |
-
"eval_runtime": 34.
|
172 |
-
"eval_samples_per_second": 1.
|
173 |
"eval_steps_per_second": 0.144,
|
174 |
"step": 110
|
175 |
}
|
176 |
],
|
177 |
"logging_steps": 10,
|
178 |
-
"max_steps":
|
179 |
"num_input_tokens_seen": 0,
|
180 |
-
"num_train_epochs":
|
181 |
"save_steps": 10,
|
182 |
"stateful_callbacks": {
|
183 |
"EarlyStoppingCallback": {
|
|
|
1 |
{
|
2 |
+
"best_metric": 1.173593521118164,
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-90",
|
4 |
"epoch": 12.222222222222221,
|
5 |
"eval_steps": 10,
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
+
"grad_norm": 0.022457197308540344,
|
14 |
+
"learning_rate": 0.0001925925925925926,
|
15 |
+
"loss": 2.0406,
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
+
"eval_loss": 1.729261875152588,
|
21 |
+
"eval_runtime": 34.8953,
|
22 |
+
"eval_samples_per_second": 1.032,
|
23 |
+
"eval_steps_per_second": 0.143,
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
+
"grad_norm": 0.018787898123264313,
|
29 |
+
"learning_rate": 0.0001851851851851852,
|
30 |
+
"loss": 1.6016,
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
+
"eval_loss": 1.5362553596496582,
|
36 |
+
"eval_runtime": 34.8752,
|
37 |
+
"eval_samples_per_second": 1.032,
|
38 |
+
"eval_steps_per_second": 0.143,
|
39 |
"step": 20
|
40 |
},
|
41 |
{
|
42 |
"epoch": 3.3333333333333335,
|
43 |
+
"grad_norm": 0.021070128306746483,
|
44 |
+
"learning_rate": 0.00017777777777777779,
|
45 |
+
"loss": 1.3937,
|
46 |
"step": 30
|
47 |
},
|
48 |
{
|
49 |
"epoch": 3.3333333333333335,
|
50 |
+
"eval_loss": 1.4144253730773926,
|
51 |
+
"eval_runtime": 34.9429,
|
52 |
+
"eval_samples_per_second": 1.03,
|
53 |
+
"eval_steps_per_second": 0.143,
|
54 |
"step": 30
|
55 |
},
|
56 |
{
|
57 |
"epoch": 4.444444444444445,
|
58 |
+
"grad_norm": 0.037991978228092194,
|
59 |
+
"learning_rate": 0.00017037037037037037,
|
60 |
+
"loss": 1.2721,
|
61 |
"step": 40
|
62 |
},
|
63 |
{
|
64 |
"epoch": 4.444444444444445,
|
65 |
+
"eval_loss": 1.3360365629196167,
|
66 |
+
"eval_runtime": 34.8947,
|
67 |
+
"eval_samples_per_second": 1.032,
|
68 |
+
"eval_steps_per_second": 0.143,
|
69 |
"step": 40
|
70 |
},
|
71 |
{
|
72 |
"epoch": 5.555555555555555,
|
73 |
+
"grad_norm": 0.029117526486516,
|
74 |
+
"learning_rate": 0.00016296296296296295,
|
75 |
+
"loss": 1.1384,
|
76 |
"step": 50
|
77 |
},
|
78 |
{
|
79 |
"epoch": 5.555555555555555,
|
80 |
+
"eval_loss": 1.2785382270812988,
|
81 |
+
"eval_runtime": 34.8447,
|
82 |
+
"eval_samples_per_second": 1.033,
|
83 |
+
"eval_steps_per_second": 0.143,
|
84 |
"step": 50
|
85 |
},
|
86 |
{
|
87 |
"epoch": 6.666666666666667,
|
88 |
+
"grad_norm": 0.0317281112074852,
|
89 |
+
"learning_rate": 0.00015555555555555556,
|
90 |
+
"loss": 1.0023,
|
91 |
"step": 60
|
92 |
},
|
93 |
{
|
94 |
"epoch": 6.666666666666667,
|
95 |
+
"eval_loss": 1.2417998313903809,
|
96 |
+
"eval_runtime": 34.8141,
|
97 |
+
"eval_samples_per_second": 1.034,
|
98 |
+
"eval_steps_per_second": 0.144,
|
99 |
"step": 60
|
100 |
},
|
101 |
{
|
102 |
"epoch": 7.777777777777778,
|
103 |
+
"grad_norm": 0.034914035350084305,
|
104 |
+
"learning_rate": 0.00014814814814814815,
|
105 |
+
"loss": 0.9166,
|
106 |
"step": 70
|
107 |
},
|
108 |
{
|
109 |
"epoch": 7.777777777777778,
|
110 |
+
"eval_loss": 1.2166908979415894,
|
111 |
+
"eval_runtime": 34.8956,
|
112 |
+
"eval_samples_per_second": 1.032,
|
113 |
+
"eval_steps_per_second": 0.143,
|
114 |
"step": 70
|
115 |
},
|
116 |
{
|
117 |
"epoch": 8.88888888888889,
|
118 |
+
"grad_norm": 0.04872061312198639,
|
119 |
+
"learning_rate": 0.00014074074074074076,
|
120 |
+
"loss": 0.7726,
|
121 |
"step": 80
|
122 |
},
|
123 |
{
|
124 |
"epoch": 8.88888888888889,
|
125 |
+
"eval_loss": 1.19890296459198,
|
126 |
+
"eval_runtime": 34.8433,
|
127 |
+
"eval_samples_per_second": 1.033,
|
128 |
+
"eval_steps_per_second": 0.143,
|
129 |
"step": 80
|
130 |
},
|
131 |
{
|
132 |
"epoch": 10.0,
|
133 |
+
"grad_norm": 0.04901803284883499,
|
134 |
+
"learning_rate": 0.00013333333333333334,
|
135 |
+
"loss": 0.676,
|
136 |
"step": 90
|
137 |
},
|
138 |
{
|
139 |
"epoch": 10.0,
|
140 |
+
"eval_loss": 1.173593521118164,
|
141 |
+
"eval_runtime": 34.7999,
|
142 |
+
"eval_samples_per_second": 1.034,
|
143 |
+
"eval_steps_per_second": 0.144,
|
144 |
"step": 90
|
145 |
},
|
146 |
{
|
147 |
"epoch": 11.11111111111111,
|
148 |
+
"grad_norm": 0.055481575429439545,
|
149 |
+
"learning_rate": 0.00012592592592592592,
|
150 |
+
"loss": 0.56,
|
151 |
"step": 100
|
152 |
},
|
153 |
{
|
154 |
"epoch": 11.11111111111111,
|
155 |
+
"eval_loss": 1.2059063911437988,
|
156 |
+
"eval_runtime": 34.8432,
|
157 |
+
"eval_samples_per_second": 1.033,
|
158 |
+
"eval_steps_per_second": 0.143,
|
159 |
"step": 100
|
160 |
},
|
161 |
{
|
162 |
"epoch": 12.222222222222221,
|
163 |
+
"grad_norm": 0.0524757020175457,
|
164 |
+
"learning_rate": 0.00011851851851851852,
|
165 |
+
"loss": 0.4567,
|
166 |
"step": 110
|
167 |
},
|
168 |
{
|
169 |
"epoch": 12.222222222222221,
|
170 |
+
"eval_loss": 1.2077444791793823,
|
171 |
+
"eval_runtime": 34.7989,
|
172 |
+
"eval_samples_per_second": 1.035,
|
173 |
"eval_steps_per_second": 0.144,
|
174 |
"step": 110
|
175 |
}
|
176 |
],
|
177 |
"logging_steps": 10,
|
178 |
+
"max_steps": 270,
|
179 |
"num_input_tokens_seen": 0,
|
180 |
+
"num_train_epochs": 30,
|
181 |
"save_steps": 10,
|
182 |
"stateful_callbacks": {
|
183 |
"EarlyStoppingCallback": {
|
checkpoint-110/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5112
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
|
3 |
size 5112
|
checkpoint-120/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
-
library_name: peft
|
3 |
base_model: TheBloke/Llama-2-7B-fp16
|
|
|
4 |
---
|
5 |
|
6 |
# Model Card for Model ID
|
|
|
1 |
---
|
|
|
2 |
base_model: TheBloke/Llama-2-7B-fp16
|
3 |
+
library_name: peft
|
4 |
---
|
5 |
|
6 |
# Model Card for Model ID
|
checkpoint-120/adapter_config.json
CHANGED
@@ -20,10 +20,10 @@
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
-
"
|
24 |
"v_proj",
|
25 |
-
"
|
26 |
-
"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
+
"q_proj",
|
24 |
"v_proj",
|
25 |
+
"k_proj",
|
26 |
+
"o_proj"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
checkpoint-120/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 67143296
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:173890a91bffdb964ce5f909803d70349ae54ba2275b81eaf33d7e10b02d2a18
|
3 |
size 67143296
|
checkpoint-120/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 134433530
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50d2389c285ac068205e38a94cd027c8b55b17736442e923e49875d92296c9dd
|
3 |
size 134433530
|
checkpoint-120/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60da1fb7525eb9f93843e0f6cf6e45c012533f0f97597344050ff835287f782f
|
3 |
size 1064
|
checkpoint-120/trainer_state.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
-
"best_metric": 1.
|
3 |
-
"best_model_checkpoint": "/kaggle/working/checkpoint-
|
4 |
"epoch": 13.333333333333334,
|
5 |
"eval_steps": 10,
|
6 |
"global_step": 120,
|
@@ -10,189 +10,189 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
-
"grad_norm": 0.
|
14 |
-
"learning_rate": 0.
|
15 |
-
"loss": 2.
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
-
"eval_loss": 1.
|
21 |
-
"eval_runtime": 34.
|
22 |
-
"eval_samples_per_second": 1.
|
23 |
-
"eval_steps_per_second": 0.
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
-
"grad_norm": 0.
|
29 |
-
"learning_rate": 0.
|
30 |
-
"loss": 1.
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
-
"eval_loss": 1.
|
36 |
-
"eval_runtime": 34.
|
37 |
-
"eval_samples_per_second": 1.
|
38 |
-
"eval_steps_per_second": 0.
|
39 |
"step": 20
|
40 |
},
|
41 |
{
|
42 |
"epoch": 3.3333333333333335,
|
43 |
-
"grad_norm": 0.
|
44 |
-
"learning_rate": 0.
|
45 |
-
"loss": 1.
|
46 |
"step": 30
|
47 |
},
|
48 |
{
|
49 |
"epoch": 3.3333333333333335,
|
50 |
-
"eval_loss": 1.
|
51 |
-
"eval_runtime": 34.
|
52 |
-
"eval_samples_per_second": 1.
|
53 |
-
"eval_steps_per_second": 0.
|
54 |
"step": 30
|
55 |
},
|
56 |
{
|
57 |
"epoch": 4.444444444444445,
|
58 |
-
"grad_norm": 0.
|
59 |
-
"learning_rate": 0.
|
60 |
-
"loss": 1.
|
61 |
"step": 40
|
62 |
},
|
63 |
{
|
64 |
"epoch": 4.444444444444445,
|
65 |
-
"eval_loss": 1.
|
66 |
-
"eval_runtime": 34.
|
67 |
-
"eval_samples_per_second": 1.
|
68 |
-
"eval_steps_per_second": 0.
|
69 |
"step": 40
|
70 |
},
|
71 |
{
|
72 |
"epoch": 5.555555555555555,
|
73 |
-
"grad_norm": 0.
|
74 |
-
"learning_rate": 0.
|
75 |
-
"loss": 1.
|
76 |
"step": 50
|
77 |
},
|
78 |
{
|
79 |
"epoch": 5.555555555555555,
|
80 |
-
"eval_loss": 1.
|
81 |
-
"eval_runtime": 34.
|
82 |
-
"eval_samples_per_second": 1.
|
83 |
-
"eval_steps_per_second": 0.
|
84 |
"step": 50
|
85 |
},
|
86 |
{
|
87 |
"epoch": 6.666666666666667,
|
88 |
-
"grad_norm": 0.
|
89 |
-
"learning_rate": 0.
|
90 |
-
"loss": 1.
|
91 |
"step": 60
|
92 |
},
|
93 |
{
|
94 |
"epoch": 6.666666666666667,
|
95 |
-
"eval_loss": 1.
|
96 |
-
"eval_runtime": 34.
|
97 |
-
"eval_samples_per_second": 1.
|
98 |
-
"eval_steps_per_second": 0.
|
99 |
"step": 60
|
100 |
},
|
101 |
{
|
102 |
"epoch": 7.777777777777778,
|
103 |
-
"grad_norm": 0.
|
104 |
-
"learning_rate":
|
105 |
-
"loss": 0.
|
106 |
"step": 70
|
107 |
},
|
108 |
{
|
109 |
"epoch": 7.777777777777778,
|
110 |
-
"eval_loss": 1.
|
111 |
-
"eval_runtime": 34.
|
112 |
-
"eval_samples_per_second": 1.
|
113 |
-
"eval_steps_per_second": 0.
|
114 |
"step": 70
|
115 |
},
|
116 |
{
|
117 |
"epoch": 8.88888888888889,
|
118 |
-
"grad_norm": 0.
|
119 |
-
"learning_rate":
|
120 |
-
"loss": 0.
|
121 |
"step": 80
|
122 |
},
|
123 |
{
|
124 |
"epoch": 8.88888888888889,
|
125 |
-
"eval_loss": 1.
|
126 |
-
"eval_runtime": 34.
|
127 |
-
"eval_samples_per_second": 1.
|
128 |
-
"eval_steps_per_second": 0.
|
129 |
"step": 80
|
130 |
},
|
131 |
{
|
132 |
"epoch": 10.0,
|
133 |
-
"grad_norm": 0.
|
134 |
-
"learning_rate":
|
135 |
-
"loss": 0.
|
136 |
"step": 90
|
137 |
},
|
138 |
{
|
139 |
"epoch": 10.0,
|
140 |
-
"eval_loss": 1.
|
141 |
-
"eval_runtime": 34.
|
142 |
-
"eval_samples_per_second": 1.
|
143 |
-
"eval_steps_per_second": 0.
|
144 |
"step": 90
|
145 |
},
|
146 |
{
|
147 |
"epoch": 11.11111111111111,
|
148 |
-
"grad_norm": 0.
|
149 |
-
"learning_rate":
|
150 |
-
"loss": 0.
|
151 |
"step": 100
|
152 |
},
|
153 |
{
|
154 |
"epoch": 11.11111111111111,
|
155 |
-
"eval_loss": 1.
|
156 |
-
"eval_runtime": 34.
|
157 |
-
"eval_samples_per_second": 1.
|
158 |
-
"eval_steps_per_second": 0.
|
159 |
"step": 100
|
160 |
},
|
161 |
{
|
162 |
"epoch": 12.222222222222221,
|
163 |
-
"grad_norm": 0.
|
164 |
-
"learning_rate":
|
165 |
-
"loss": 0.
|
166 |
"step": 110
|
167 |
},
|
168 |
{
|
169 |
"epoch": 12.222222222222221,
|
170 |
-
"eval_loss": 1.
|
171 |
-
"eval_runtime": 34.
|
172 |
-
"eval_samples_per_second": 1.
|
173 |
"eval_steps_per_second": 0.144,
|
174 |
"step": 110
|
175 |
},
|
176 |
{
|
177 |
"epoch": 13.333333333333334,
|
178 |
-
"grad_norm": 0.
|
179 |
-
"learning_rate":
|
180 |
-
"loss": 0.
|
181 |
"step": 120
|
182 |
},
|
183 |
{
|
184 |
"epoch": 13.333333333333334,
|
185 |
-
"eval_loss": 1.
|
186 |
-
"eval_runtime": 34.
|
187 |
-
"eval_samples_per_second": 1.
|
188 |
"eval_steps_per_second": 0.144,
|
189 |
"step": 120
|
190 |
}
|
191 |
],
|
192 |
"logging_steps": 10,
|
193 |
-
"max_steps":
|
194 |
"num_input_tokens_seen": 0,
|
195 |
-
"num_train_epochs":
|
196 |
"save_steps": 10,
|
197 |
"stateful_callbacks": {
|
198 |
"EarlyStoppingCallback": {
|
@@ -210,7 +210,7 @@
|
|
210 |
"should_evaluate": false,
|
211 |
"should_log": false,
|
212 |
"should_save": true,
|
213 |
-
"should_training_stop":
|
214 |
},
|
215 |
"attributes": {}
|
216 |
}
|
|
|
1 |
{
|
2 |
+
"best_metric": 1.173593521118164,
|
3 |
+
"best_model_checkpoint": "/kaggle/working/checkpoint-90",
|
4 |
"epoch": 13.333333333333334,
|
5 |
"eval_steps": 10,
|
6 |
"global_step": 120,
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
+
"grad_norm": 0.022457197308540344,
|
14 |
+
"learning_rate": 0.0001925925925925926,
|
15 |
+
"loss": 2.0406,
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
+
"eval_loss": 1.729261875152588,
|
21 |
+
"eval_runtime": 34.8953,
|
22 |
+
"eval_samples_per_second": 1.032,
|
23 |
+
"eval_steps_per_second": 0.143,
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
+
"grad_norm": 0.018787898123264313,
|
29 |
+
"learning_rate": 0.0001851851851851852,
|
30 |
+
"loss": 1.6016,
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
+
"eval_loss": 1.5362553596496582,
|
36 |
+
"eval_runtime": 34.8752,
|
37 |
+
"eval_samples_per_second": 1.032,
|
38 |
+
"eval_steps_per_second": 0.143,
|
39 |
"step": 20
|
40 |
},
|
41 |
{
|
42 |
"epoch": 3.3333333333333335,
|
43 |
+
"grad_norm": 0.021070128306746483,
|
44 |
+
"learning_rate": 0.00017777777777777779,
|
45 |
+
"loss": 1.3937,
|
46 |
"step": 30
|
47 |
},
|
48 |
{
|
49 |
"epoch": 3.3333333333333335,
|
50 |
+
"eval_loss": 1.4144253730773926,
|
51 |
+
"eval_runtime": 34.9429,
|
52 |
+
"eval_samples_per_second": 1.03,
|
53 |
+
"eval_steps_per_second": 0.143,
|
54 |
"step": 30
|
55 |
},
|
56 |
{
|
57 |
"epoch": 4.444444444444445,
|
58 |
+
"grad_norm": 0.037991978228092194,
|
59 |
+
"learning_rate": 0.00017037037037037037,
|
60 |
+
"loss": 1.2721,
|
61 |
"step": 40
|
62 |
},
|
63 |
{
|
64 |
"epoch": 4.444444444444445,
|
65 |
+
"eval_loss": 1.3360365629196167,
|
66 |
+
"eval_runtime": 34.8947,
|
67 |
+
"eval_samples_per_second": 1.032,
|
68 |
+
"eval_steps_per_second": 0.143,
|
69 |
"step": 40
|
70 |
},
|
71 |
{
|
72 |
"epoch": 5.555555555555555,
|
73 |
+
"grad_norm": 0.029117526486516,
|
74 |
+
"learning_rate": 0.00016296296296296295,
|
75 |
+
"loss": 1.1384,
|
76 |
"step": 50
|
77 |
},
|
78 |
{
|
79 |
"epoch": 5.555555555555555,
|
80 |
+
"eval_loss": 1.2785382270812988,
|
81 |
+
"eval_runtime": 34.8447,
|
82 |
+
"eval_samples_per_second": 1.033,
|
83 |
+
"eval_steps_per_second": 0.143,
|
84 |
"step": 50
|
85 |
},
|
86 |
{
|
87 |
"epoch": 6.666666666666667,
|
88 |
+
"grad_norm": 0.0317281112074852,
|
89 |
+
"learning_rate": 0.00015555555555555556,
|
90 |
+
"loss": 1.0023,
|
91 |
"step": 60
|
92 |
},
|
93 |
{
|
94 |
"epoch": 6.666666666666667,
|
95 |
+
"eval_loss": 1.2417998313903809,
|
96 |
+
"eval_runtime": 34.8141,
|
97 |
+
"eval_samples_per_second": 1.034,
|
98 |
+
"eval_steps_per_second": 0.144,
|
99 |
"step": 60
|
100 |
},
|
101 |
{
|
102 |
"epoch": 7.777777777777778,
|
103 |
+
"grad_norm": 0.034914035350084305,
|
104 |
+
"learning_rate": 0.00014814814814814815,
|
105 |
+
"loss": 0.9166,
|
106 |
"step": 70
|
107 |
},
|
108 |
{
|
109 |
"epoch": 7.777777777777778,
|
110 |
+
"eval_loss": 1.2166908979415894,
|
111 |
+
"eval_runtime": 34.8956,
|
112 |
+
"eval_samples_per_second": 1.032,
|
113 |
+
"eval_steps_per_second": 0.143,
|
114 |
"step": 70
|
115 |
},
|
116 |
{
|
117 |
"epoch": 8.88888888888889,
|
118 |
+
"grad_norm": 0.04872061312198639,
|
119 |
+
"learning_rate": 0.00014074074074074076,
|
120 |
+
"loss": 0.7726,
|
121 |
"step": 80
|
122 |
},
|
123 |
{
|
124 |
"epoch": 8.88888888888889,
|
125 |
+
"eval_loss": 1.19890296459198,
|
126 |
+
"eval_runtime": 34.8433,
|
127 |
+
"eval_samples_per_second": 1.033,
|
128 |
+
"eval_steps_per_second": 0.143,
|
129 |
"step": 80
|
130 |
},
|
131 |
{
|
132 |
"epoch": 10.0,
|
133 |
+
"grad_norm": 0.04901803284883499,
|
134 |
+
"learning_rate": 0.00013333333333333334,
|
135 |
+
"loss": 0.676,
|
136 |
"step": 90
|
137 |
},
|
138 |
{
|
139 |
"epoch": 10.0,
|
140 |
+
"eval_loss": 1.173593521118164,
|
141 |
+
"eval_runtime": 34.7999,
|
142 |
+
"eval_samples_per_second": 1.034,
|
143 |
+
"eval_steps_per_second": 0.144,
|
144 |
"step": 90
|
145 |
},
|
146 |
{
|
147 |
"epoch": 11.11111111111111,
|
148 |
+
"grad_norm": 0.055481575429439545,
|
149 |
+
"learning_rate": 0.00012592592592592592,
|
150 |
+
"loss": 0.56,
|
151 |
"step": 100
|
152 |
},
|
153 |
{
|
154 |
"epoch": 11.11111111111111,
|
155 |
+
"eval_loss": 1.2059063911437988,
|
156 |
+
"eval_runtime": 34.8432,
|
157 |
+
"eval_samples_per_second": 1.033,
|
158 |
+
"eval_steps_per_second": 0.143,
|
159 |
"step": 100
|
160 |
},
|
161 |
{
|
162 |
"epoch": 12.222222222222221,
|
163 |
+
"grad_norm": 0.0524757020175457,
|
164 |
+
"learning_rate": 0.00011851851851851852,
|
165 |
+
"loss": 0.4567,
|
166 |
"step": 110
|
167 |
},
|
168 |
{
|
169 |
"epoch": 12.222222222222221,
|
170 |
+
"eval_loss": 1.2077444791793823,
|
171 |
+
"eval_runtime": 34.7989,
|
172 |
+
"eval_samples_per_second": 1.035,
|
173 |
"eval_steps_per_second": 0.144,
|
174 |
"step": 110
|
175 |
},
|
176 |
{
|
177 |
"epoch": 13.333333333333334,
|
178 |
+
"grad_norm": 0.053020887076854706,
|
179 |
+
"learning_rate": 0.00011111111111111112,
|
180 |
+
"loss": 0.3915,
|
181 |
"step": 120
|
182 |
},
|
183 |
{
|
184 |
"epoch": 13.333333333333334,
|
185 |
+
"eval_loss": 1.2036480903625488,
|
186 |
+
"eval_runtime": 34.802,
|
187 |
+
"eval_samples_per_second": 1.034,
|
188 |
"eval_steps_per_second": 0.144,
|
189 |
"step": 120
|
190 |
}
|
191 |
],
|
192 |
"logging_steps": 10,
|
193 |
+
"max_steps": 270,
|
194 |
"num_input_tokens_seen": 0,
|
195 |
+
"num_train_epochs": 30,
|
196 |
"save_steps": 10,
|
197 |
"stateful_callbacks": {
|
198 |
"EarlyStoppingCallback": {
|
|
|
210 |
"should_evaluate": false,
|
211 |
"should_log": false,
|
212 |
"should_save": true,
|
213 |
+
"should_training_stop": true
|
214 |
},
|
215 |
"attributes": {}
|
216 |
}
|
checkpoint-120/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5112
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
|
3 |
size 5112
|
checkpoint-20/adapter_config.json
CHANGED
@@ -20,10 +20,10 @@
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
-
"k_proj",
|
24 |
"q_proj",
|
25 |
-
"
|
26 |
-
"
|
|
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
|
|
23 |
"q_proj",
|
24 |
+
"v_proj",
|
25 |
+
"k_proj",
|
26 |
+
"o_proj"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
checkpoint-20/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 67143296
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e350a043d6e188be3930ae109597f7418b1c57332d8722c377acf61b839280db
|
3 |
size 67143296
|
checkpoint-20/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 134433530
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53aaa85798d6640d73ad6607ea99ffb3eee0b87eb9130f5f653e9d52f119e393
|
3 |
size 134433530
|
checkpoint-20/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1c05a2d1065158cc7891c3a0806de9d2368277087e1ac23c872a14cc5ce6a082
|
3 |
size 1064
|
checkpoint-20/trainer_state.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"best_metric": 1.
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-20",
|
4 |
"epoch": 2.2222222222222223,
|
5 |
"eval_steps": 10,
|
@@ -10,39 +10,39 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
-
"grad_norm": 0.
|
14 |
-
"learning_rate": 0.
|
15 |
-
"loss": 2.
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
-
"eval_loss": 1.
|
21 |
-
"eval_runtime":
|
22 |
-
"eval_samples_per_second": 1.
|
23 |
-
"eval_steps_per_second": 0.
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
-
"grad_norm": 0.
|
29 |
-
"learning_rate": 0.
|
30 |
-
"loss": 1.
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
-
"eval_loss": 1.
|
36 |
-
"eval_runtime": 34.
|
37 |
-
"eval_samples_per_second": 1.
|
38 |
-
"eval_steps_per_second": 0.
|
39 |
"step": 20
|
40 |
}
|
41 |
],
|
42 |
"logging_steps": 10,
|
43 |
-
"max_steps":
|
44 |
"num_input_tokens_seen": 0,
|
45 |
-
"num_train_epochs":
|
46 |
"save_steps": 10,
|
47 |
"stateful_callbacks": {
|
48 |
"EarlyStoppingCallback": {
|
|
|
1 |
{
|
2 |
+
"best_metric": 1.5362553596496582,
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-20",
|
4 |
"epoch": 2.2222222222222223,
|
5 |
"eval_steps": 10,
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
+
"grad_norm": 0.022457197308540344,
|
14 |
+
"learning_rate": 0.0001925925925925926,
|
15 |
+
"loss": 2.0406,
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
+
"eval_loss": 1.729261875152588,
|
21 |
+
"eval_runtime": 34.8953,
|
22 |
+
"eval_samples_per_second": 1.032,
|
23 |
+
"eval_steps_per_second": 0.143,
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
+
"grad_norm": 0.018787898123264313,
|
29 |
+
"learning_rate": 0.0001851851851851852,
|
30 |
+
"loss": 1.6016,
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
+
"eval_loss": 1.5362553596496582,
|
36 |
+
"eval_runtime": 34.8752,
|
37 |
+
"eval_samples_per_second": 1.032,
|
38 |
+
"eval_steps_per_second": 0.143,
|
39 |
"step": 20
|
40 |
}
|
41 |
],
|
42 |
"logging_steps": 10,
|
43 |
+
"max_steps": 270,
|
44 |
"num_input_tokens_seen": 0,
|
45 |
+
"num_train_epochs": 30,
|
46 |
"save_steps": 10,
|
47 |
"stateful_callbacks": {
|
48 |
"EarlyStoppingCallback": {
|
checkpoint-20/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5112
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
|
3 |
size 5112
|
checkpoint-30/adapter_config.json
CHANGED
@@ -20,10 +20,10 @@
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
-
"k_proj",
|
24 |
"q_proj",
|
25 |
-
"
|
26 |
-
"
|
|
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
|
|
23 |
"q_proj",
|
24 |
+
"v_proj",
|
25 |
+
"k_proj",
|
26 |
+
"o_proj"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
checkpoint-30/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 67143296
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e5afb85b43418de4f387e019cd5ff83db304f24c87600e2deb9b497bc225833e
|
3 |
size 67143296
|
checkpoint-30/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 134433530
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d9d88f9eafcc9de7708775af55f4954ceecfc02ad0285772a7592b7f07336a6c
|
3 |
size 134433530
|
checkpoint-30/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:78882a59797d983394328068e814e7aad08e194b72ebc7003618cfb9ff129ecf
|
3 |
size 1064
|
checkpoint-30/trainer_state.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"best_metric": 1.
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-30",
|
4 |
"epoch": 3.3333333333333335,
|
5 |
"eval_steps": 10,
|
@@ -10,54 +10,54 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
-
"grad_norm": 0.
|
14 |
-
"learning_rate": 0.
|
15 |
-
"loss": 2.
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
-
"eval_loss": 1.
|
21 |
-
"eval_runtime":
|
22 |
-
"eval_samples_per_second": 1.
|
23 |
-
"eval_steps_per_second": 0.
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
-
"grad_norm": 0.
|
29 |
-
"learning_rate": 0.
|
30 |
-
"loss": 1.
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
-
"eval_loss": 1.
|
36 |
-
"eval_runtime": 34.
|
37 |
-
"eval_samples_per_second": 1.
|
38 |
-
"eval_steps_per_second": 0.
|
39 |
"step": 20
|
40 |
},
|
41 |
{
|
42 |
"epoch": 3.3333333333333335,
|
43 |
-
"grad_norm": 0.
|
44 |
-
"learning_rate": 0.
|
45 |
-
"loss": 1.
|
46 |
"step": 30
|
47 |
},
|
48 |
{
|
49 |
"epoch": 3.3333333333333335,
|
50 |
-
"eval_loss": 1.
|
51 |
-
"eval_runtime": 34.
|
52 |
-
"eval_samples_per_second": 1.
|
53 |
"eval_steps_per_second": 0.143,
|
54 |
"step": 30
|
55 |
}
|
56 |
],
|
57 |
"logging_steps": 10,
|
58 |
-
"max_steps":
|
59 |
"num_input_tokens_seen": 0,
|
60 |
-
"num_train_epochs":
|
61 |
"save_steps": 10,
|
62 |
"stateful_callbacks": {
|
63 |
"EarlyStoppingCallback": {
|
|
|
1 |
{
|
2 |
+
"best_metric": 1.4144253730773926,
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-30",
|
4 |
"epoch": 3.3333333333333335,
|
5 |
"eval_steps": 10,
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
+
"grad_norm": 0.022457197308540344,
|
14 |
+
"learning_rate": 0.0001925925925925926,
|
15 |
+
"loss": 2.0406,
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
+
"eval_loss": 1.729261875152588,
|
21 |
+
"eval_runtime": 34.8953,
|
22 |
+
"eval_samples_per_second": 1.032,
|
23 |
+
"eval_steps_per_second": 0.143,
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
+
"grad_norm": 0.018787898123264313,
|
29 |
+
"learning_rate": 0.0001851851851851852,
|
30 |
+
"loss": 1.6016,
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
+
"eval_loss": 1.5362553596496582,
|
36 |
+
"eval_runtime": 34.8752,
|
37 |
+
"eval_samples_per_second": 1.032,
|
38 |
+
"eval_steps_per_second": 0.143,
|
39 |
"step": 20
|
40 |
},
|
41 |
{
|
42 |
"epoch": 3.3333333333333335,
|
43 |
+
"grad_norm": 0.021070128306746483,
|
44 |
+
"learning_rate": 0.00017777777777777779,
|
45 |
+
"loss": 1.3937,
|
46 |
"step": 30
|
47 |
},
|
48 |
{
|
49 |
"epoch": 3.3333333333333335,
|
50 |
+
"eval_loss": 1.4144253730773926,
|
51 |
+
"eval_runtime": 34.9429,
|
52 |
+
"eval_samples_per_second": 1.03,
|
53 |
"eval_steps_per_second": 0.143,
|
54 |
"step": 30
|
55 |
}
|
56 |
],
|
57 |
"logging_steps": 10,
|
58 |
+
"max_steps": 270,
|
59 |
"num_input_tokens_seen": 0,
|
60 |
+
"num_train_epochs": 30,
|
61 |
"save_steps": 10,
|
62 |
"stateful_callbacks": {
|
63 |
"EarlyStoppingCallback": {
|
checkpoint-30/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5112
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
|
3 |
size 5112
|
checkpoint-40/adapter_config.json
CHANGED
@@ -20,10 +20,10 @@
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
-
"k_proj",
|
24 |
"q_proj",
|
25 |
-
"
|
26 |
-
"
|
|
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
|
|
23 |
"q_proj",
|
24 |
+
"v_proj",
|
25 |
+
"k_proj",
|
26 |
+
"o_proj"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
checkpoint-40/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 67143296
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7c9e0b673563a407b5292005048d7f9e55e28f761356c6b7d865a6f14dbd4d1f
|
3 |
size 67143296
|
checkpoint-40/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 134433530
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7af07c81b6d2394145239563c1fcabf8a96f4c073bcbe06adbe0e38de3e745d4
|
3 |
size 134433530
|
checkpoint-40/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:441aaa824ef89ae7e5156933dc6dbe413f7c295a974e1ca6e7641ce94bd233fa
|
3 |
size 1064
|
checkpoint-40/trainer_state.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"best_metric": 1.
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-40",
|
4 |
"epoch": 4.444444444444445,
|
5 |
"eval_steps": 10,
|
@@ -10,69 +10,69 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
-
"grad_norm": 0.
|
14 |
-
"learning_rate": 0.
|
15 |
-
"loss": 2.
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
-
"eval_loss": 1.
|
21 |
-
"eval_runtime":
|
22 |
-
"eval_samples_per_second": 1.
|
23 |
-
"eval_steps_per_second": 0.
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
-
"grad_norm": 0.
|
29 |
-
"learning_rate": 0.
|
30 |
-
"loss": 1.
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
-
"eval_loss": 1.
|
36 |
-
"eval_runtime": 34.
|
37 |
-
"eval_samples_per_second": 1.
|
38 |
-
"eval_steps_per_second": 0.
|
39 |
"step": 20
|
40 |
},
|
41 |
{
|
42 |
"epoch": 3.3333333333333335,
|
43 |
-
"grad_norm": 0.
|
44 |
-
"learning_rate": 0.
|
45 |
-
"loss": 1.
|
46 |
"step": 30
|
47 |
},
|
48 |
{
|
49 |
"epoch": 3.3333333333333335,
|
50 |
-
"eval_loss": 1.
|
51 |
-
"eval_runtime": 34.
|
52 |
-
"eval_samples_per_second": 1.
|
53 |
"eval_steps_per_second": 0.143,
|
54 |
"step": 30
|
55 |
},
|
56 |
{
|
57 |
"epoch": 4.444444444444445,
|
58 |
-
"grad_norm": 0.
|
59 |
-
"learning_rate": 0.
|
60 |
-
"loss": 1.
|
61 |
"step": 40
|
62 |
},
|
63 |
{
|
64 |
"epoch": 4.444444444444445,
|
65 |
-
"eval_loss": 1.
|
66 |
-
"eval_runtime":
|
67 |
-
"eval_samples_per_second": 1.
|
68 |
"eval_steps_per_second": 0.143,
|
69 |
"step": 40
|
70 |
}
|
71 |
],
|
72 |
"logging_steps": 10,
|
73 |
-
"max_steps":
|
74 |
"num_input_tokens_seen": 0,
|
75 |
-
"num_train_epochs":
|
76 |
"save_steps": 10,
|
77 |
"stateful_callbacks": {
|
78 |
"EarlyStoppingCallback": {
|
|
|
1 |
{
|
2 |
+
"best_metric": 1.3360365629196167,
|
3 |
"best_model_checkpoint": "/kaggle/working/checkpoint-40",
|
4 |
"epoch": 4.444444444444445,
|
5 |
"eval_steps": 10,
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.1111111111111112,
|
13 |
+
"grad_norm": 0.022457197308540344,
|
14 |
+
"learning_rate": 0.0001925925925925926,
|
15 |
+
"loss": 2.0406,
|
16 |
"step": 10
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.1111111111111112,
|
20 |
+
"eval_loss": 1.729261875152588,
|
21 |
+
"eval_runtime": 34.8953,
|
22 |
+
"eval_samples_per_second": 1.032,
|
23 |
+
"eval_steps_per_second": 0.143,
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
"epoch": 2.2222222222222223,
|
28 |
+
"grad_norm": 0.018787898123264313,
|
29 |
+
"learning_rate": 0.0001851851851851852,
|
30 |
+
"loss": 1.6016,
|
31 |
"step": 20
|
32 |
},
|
33 |
{
|
34 |
"epoch": 2.2222222222222223,
|
35 |
+
"eval_loss": 1.5362553596496582,
|
36 |
+
"eval_runtime": 34.8752,
|
37 |
+
"eval_samples_per_second": 1.032,
|
38 |
+
"eval_steps_per_second": 0.143,
|
39 |
"step": 20
|
40 |
},
|
41 |
{
|
42 |
"epoch": 3.3333333333333335,
|
43 |
+
"grad_norm": 0.021070128306746483,
|
44 |
+
"learning_rate": 0.00017777777777777779,
|
45 |
+
"loss": 1.3937,
|
46 |
"step": 30
|
47 |
},
|
48 |
{
|
49 |
"epoch": 3.3333333333333335,
|
50 |
+
"eval_loss": 1.4144253730773926,
|
51 |
+
"eval_runtime": 34.9429,
|
52 |
+
"eval_samples_per_second": 1.03,
|
53 |
"eval_steps_per_second": 0.143,
|
54 |
"step": 30
|
55 |
},
|
56 |
{
|
57 |
"epoch": 4.444444444444445,
|
58 |
+
"grad_norm": 0.037991978228092194,
|
59 |
+
"learning_rate": 0.00017037037037037037,
|
60 |
+
"loss": 1.2721,
|
61 |
"step": 40
|
62 |
},
|
63 |
{
|
64 |
"epoch": 4.444444444444445,
|
65 |
+
"eval_loss": 1.3360365629196167,
|
66 |
+
"eval_runtime": 34.8947,
|
67 |
+
"eval_samples_per_second": 1.032,
|
68 |
"eval_steps_per_second": 0.143,
|
69 |
"step": 40
|
70 |
}
|
71 |
],
|
72 |
"logging_steps": 10,
|
73 |
+
"max_steps": 270,
|
74 |
"num_input_tokens_seen": 0,
|
75 |
+
"num_train_epochs": 30,
|
76 |
"save_steps": 10,
|
77 |
"stateful_callbacks": {
|
78 |
"EarlyStoppingCallback": {
|
checkpoint-40/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5112
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:684c996a3a50f1a69fb88c68782f86a9a626380e77256851eaf5476133a0926e
|
3 |
size 5112
|
checkpoint-50/adapter_config.json
CHANGED
@@ -20,10 +20,10 @@
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
-
"k_proj",
|
24 |
"q_proj",
|
25 |
-
"
|
26 |
-
"
|
|
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
|
|
23 |
"q_proj",
|
24 |
+
"v_proj",
|
25 |
+
"k_proj",
|
26 |
+
"o_proj"
|
27 |
],
|
28 |
"task_type": "CAUSAL_LM",
|
29 |
"use_dora": false,
|
checkpoint-50/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 67143296
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff24760e35eba65f38905cbc2d2b23ce73feb2da6a3bb98fa083cbd3cc564571
|
3 |
size 67143296
|
checkpoint-50/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 134433530
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:98ab5b6b7c2997d594b9c7e48d2cd958c58dcdf4eaf60e9a5fb6497764869314
|
3 |
size 134433530
|