Model save
Browse files- README.md +4 -6
- adapter_config.json +4 -4
- adapter_model.safetensors +1 -1
- all_results.json +8 -8
- eval_results.json +4 -4
- train_results.json +4 -4
- trainer_state.json +24 -24
README.md
CHANGED
@@ -2,13 +2,11 @@
|
|
2 |
license: other
|
3 |
library_name: peft
|
4 |
tags:
|
5 |
-
- alignment-handbook
|
6 |
-
- generated_from_trainer
|
7 |
- trl
|
8 |
- sft
|
9 |
- generated_from_trainer
|
10 |
datasets:
|
11 |
-
-
|
12 |
base_model: 01-ai/Yi-6B
|
13 |
model-index:
|
14 |
- name: Yi-6B-zhihu5
|
@@ -20,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
|
|
20 |
|
21 |
# Yi-6B-zhihu5
|
22 |
|
23 |
-
This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the
|
24 |
It achieves the following results on the evaluation set:
|
25 |
- Loss: 2.5677
|
26 |
|
@@ -72,8 +70,8 @@ The following hyperparameters were used during training:
|
|
72 |
| 2.5443 | 16.0 | 1536 | 2.5677 |
|
73 |
| 2.5972 | 17.0 | 1632 | 2.5677 |
|
74 |
| 2.5361 | 18.0 | 1728 | 2.5677 |
|
75 |
-
| 2.
|
76 |
-
| 2.
|
77 |
|
78 |
|
79 |
### Framework versions
|
|
|
2 |
license: other
|
3 |
library_name: peft
|
4 |
tags:
|
|
|
|
|
5 |
- trl
|
6 |
- sft
|
7 |
- generated_from_trainer
|
8 |
datasets:
|
9 |
+
- generator
|
10 |
base_model: 01-ai/Yi-6B
|
11 |
model-index:
|
12 |
- name: Yi-6B-zhihu5
|
|
|
18 |
|
19 |
# Yi-6B-zhihu5
|
20 |
|
21 |
+
This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the generator dataset.
|
22 |
It achieves the following results on the evaluation set:
|
23 |
- Loss: 2.5677
|
24 |
|
|
|
70 |
| 2.5443 | 16.0 | 1536 | 2.5677 |
|
71 |
| 2.5972 | 17.0 | 1632 | 2.5677 |
|
72 |
| 2.5361 | 18.0 | 1728 | 2.5677 |
|
73 |
+
| 2.6119 | 19.0 | 1824 | 2.5677 |
|
74 |
+
| 2.6321 | 20.0 | 1920 | 2.5677 |
|
75 |
|
76 |
|
77 |
### Framework versions
|
adapter_config.json
CHANGED
@@ -19,13 +19,13 @@
|
|
19 |
"rank_pattern": {},
|
20 |
"revision": null,
|
21 |
"target_modules": [
|
22 |
-
"gate_proj",
|
23 |
"o_proj",
|
24 |
-
"down_proj",
|
25 |
"k_proj",
|
26 |
-
"
|
|
|
27 |
"q_proj",
|
28 |
-
"
|
|
|
29 |
],
|
30 |
"task_type": "CAUSAL_LM"
|
31 |
}
|
|
|
19 |
"rank_pattern": {},
|
20 |
"revision": null,
|
21 |
"target_modules": [
|
|
|
22 |
"o_proj",
|
|
|
23 |
"k_proj",
|
24 |
+
"up_proj",
|
25 |
+
"gate_proj",
|
26 |
"q_proj",
|
27 |
+
"v_proj",
|
28 |
+
"down_proj"
|
29 |
],
|
30 |
"task_type": "CAUSAL_LM"
|
31 |
}
|
adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 72673912
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:68d597f15641f407924e32d9666bbea85b9409352c7904467ce912e1804fb644
|
3 |
size 72673912
|
all_results.json
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
{
|
2 |
"epoch": 20.0,
|
3 |
-
"eval_loss": 2.
|
4 |
-
"eval_runtime": 164.
|
5 |
"eval_samples": 2561,
|
6 |
-
"eval_samples_per_second": 4.
|
7 |
-
"eval_steps_per_second": 0.
|
8 |
-
"train_loss": 0.
|
9 |
-
"train_runtime":
|
10 |
"train_samples": 2561,
|
11 |
-
"train_samples_per_second":
|
12 |
-
"train_steps_per_second":
|
13 |
}
|
|
|
1 |
{
|
2 |
"epoch": 20.0,
|
3 |
+
"eval_loss": 2.5677125453948975,
|
4 |
+
"eval_runtime": 164.9005,
|
5 |
"eval_samples": 2561,
|
6 |
+
"eval_samples_per_second": 4.645,
|
7 |
+
"eval_steps_per_second": 0.582,
|
8 |
+
"train_loss": 0.16053936282793682,
|
9 |
+
"train_runtime": 1034.0653,
|
10 |
"train_samples": 2561,
|
11 |
+
"train_samples_per_second": 14.815,
|
12 |
+
"train_steps_per_second": 1.857
|
13 |
}
|
eval_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
"epoch": 20.0,
|
3 |
-
"eval_loss": 2.
|
4 |
-
"eval_runtime": 164.
|
5 |
"eval_samples": 2561,
|
6 |
-
"eval_samples_per_second": 4.
|
7 |
-
"eval_steps_per_second": 0.
|
8 |
}
|
|
|
1 |
{
|
2 |
"epoch": 20.0,
|
3 |
+
"eval_loss": 2.5677125453948975,
|
4 |
+
"eval_runtime": 164.9005,
|
5 |
"eval_samples": 2561,
|
6 |
+
"eval_samples_per_second": 4.645,
|
7 |
+
"eval_steps_per_second": 0.582
|
8 |
}
|
train_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
"epoch": 20.0,
|
3 |
-
"train_loss": 0.
|
4 |
-
"train_runtime":
|
5 |
"train_samples": 2561,
|
6 |
-
"train_samples_per_second":
|
7 |
-
"train_steps_per_second":
|
8 |
}
|
|
|
1 |
{
|
2 |
"epoch": 20.0,
|
3 |
+
"train_loss": 0.16053936282793682,
|
4 |
+
"train_runtime": 1034.0653,
|
5 |
"train_samples": 2561,
|
6 |
+
"train_samples_per_second": 14.815,
|
7 |
+
"train_steps_per_second": 1.857
|
8 |
}
|
trainer_state.json
CHANGED
@@ -2321,39 +2321,39 @@
|
|
2321 |
{
|
2322 |
"epoch": 18.8,
|
2323 |
"learning_rate": 1.0888432552681405e-07,
|
2324 |
-
"loss": 2.
|
2325 |
"step": 1805
|
2326 |
},
|
2327 |
{
|
2328 |
"epoch": 18.85,
|
2329 |
"learning_rate": 9.965285027552452e-08,
|
2330 |
-
"loss": 2.
|
2331 |
"step": 1810
|
2332 |
},
|
2333 |
{
|
2334 |
"epoch": 18.91,
|
2335 |
"learning_rate": 9.082630146352356e-08,
|
2336 |
-
"loss": 2.
|
2337 |
"step": 1815
|
2338 |
},
|
2339 |
{
|
2340 |
"epoch": 18.96,
|
2341 |
"learning_rate": 8.240540844791145e-08,
|
2342 |
-
"loss": 2.
|
2343 |
"step": 1820
|
2344 |
},
|
2345 |
{
|
2346 |
"epoch": 19.0,
|
2347 |
-
"eval_loss": 2.
|
2348 |
-
"eval_runtime": 164.
|
2349 |
-
"eval_samples_per_second": 4.
|
2350 |
"eval_steps_per_second": 0.582,
|
2351 |
"step": 1824
|
2352 |
},
|
2353 |
{
|
2354 |
"epoch": 19.01,
|
2355 |
"learning_rate": 7.439086706555743e-08,
|
2356 |
-
"loss": 2.
|
2357 |
"step": 1825
|
2358 |
},
|
2359 |
{
|
@@ -2377,19 +2377,19 @@
|
|
2377 |
{
|
2378 |
"epoch": 19.22,
|
2379 |
"learning_rate": 4.640895825593683e-08,
|
2380 |
-
"loss": 2.
|
2381 |
"step": 1845
|
2382 |
},
|
2383 |
{
|
2384 |
"epoch": 19.27,
|
2385 |
"learning_rate": 4.0435435515532304e-08,
|
2386 |
-
"loss": 2.
|
2387 |
"step": 1850
|
2388 |
},
|
2389 |
{
|
2390 |
"epoch": 19.32,
|
2391 |
"learning_rate": 3.487173247935627e-08,
|
2392 |
-
"loss": 2.
|
2393 |
"step": 1855
|
2394 |
},
|
2395 |
{
|
@@ -2401,19 +2401,19 @@
|
|
2401 |
{
|
2402 |
"epoch": 19.43,
|
2403 |
"learning_rate": 2.4975590581369778e-08,
|
2404 |
-
"loss": 2.
|
2405 |
"step": 1865
|
2406 |
},
|
2407 |
{
|
2408 |
"epoch": 19.48,
|
2409 |
"learning_rate": 2.0643969459482326e-08,
|
2410 |
-
"loss": 2.
|
2411 |
"step": 1870
|
2412 |
},
|
2413 |
{
|
2414 |
"epoch": 19.53,
|
2415 |
"learning_rate": 1.6723803454098408e-08,
|
2416 |
-
"loss": 2.
|
2417 |
"step": 1875
|
2418 |
},
|
2419 |
{
|
@@ -2425,7 +2425,7 @@
|
|
2425 |
{
|
2426 |
"epoch": 19.64,
|
2427 |
"learning_rate": 1.0119098494316693e-08,
|
2428 |
-
"loss": 2.
|
2429 |
"step": 1885
|
2430 |
},
|
2431 |
{
|
@@ -2437,7 +2437,7 @@
|
|
2437 |
{
|
2438 |
"epoch": 19.74,
|
2439 |
"learning_rate": 5.163658701989316e-09,
|
2440 |
-
"loss": 2.
|
2441 |
"step": 1895
|
2442 |
},
|
2443 |
{
|
@@ -2455,7 +2455,7 @@
|
|
2455 |
{
|
2456 |
"epoch": 19.9,
|
2457 |
"learning_rate": 8.26304875812256e-10,
|
2458 |
-
"loss": 2.
|
2459 |
"step": 1910
|
2460 |
},
|
2461 |
{
|
@@ -2467,13 +2467,13 @@
|
|
2467 |
{
|
2468 |
"epoch": 20.0,
|
2469 |
"learning_rate": 0.0,
|
2470 |
-
"loss": 2.
|
2471 |
"step": 1920
|
2472 |
},
|
2473 |
{
|
2474 |
"epoch": 20.0,
|
2475 |
-
"eval_loss": 2.
|
2476 |
-
"eval_runtime": 164.
|
2477 |
"eval_samples_per_second": 4.648,
|
2478 |
"eval_steps_per_second": 0.583,
|
2479 |
"step": 1920
|
@@ -2482,10 +2482,10 @@
|
|
2482 |
"epoch": 20.0,
|
2483 |
"step": 1920,
|
2484 |
"total_flos": 1.0984887148766822e+18,
|
2485 |
-
"train_loss": 0.
|
2486 |
-
"train_runtime":
|
2487 |
-
"train_samples_per_second":
|
2488 |
-
"train_steps_per_second":
|
2489 |
}
|
2490 |
],
|
2491 |
"logging_steps": 5,
|
|
|
2321 |
{
|
2322 |
"epoch": 18.8,
|
2323 |
"learning_rate": 1.0888432552681405e-07,
|
2324 |
+
"loss": 2.4623,
|
2325 |
"step": 1805
|
2326 |
},
|
2327 |
{
|
2328 |
"epoch": 18.85,
|
2329 |
"learning_rate": 9.965285027552452e-08,
|
2330 |
+
"loss": 2.538,
|
2331 |
"step": 1810
|
2332 |
},
|
2333 |
{
|
2334 |
"epoch": 18.91,
|
2335 |
"learning_rate": 9.082630146352356e-08,
|
2336 |
+
"loss": 2.5888,
|
2337 |
"step": 1815
|
2338 |
},
|
2339 |
{
|
2340 |
"epoch": 18.96,
|
2341 |
"learning_rate": 8.240540844791145e-08,
|
2342 |
+
"loss": 2.6119,
|
2343 |
"step": 1820
|
2344 |
},
|
2345 |
{
|
2346 |
"epoch": 19.0,
|
2347 |
+
"eval_loss": 2.567678689956665,
|
2348 |
+
"eval_runtime": 164.8569,
|
2349 |
+
"eval_samples_per_second": 4.646,
|
2350 |
"eval_steps_per_second": 0.582,
|
2351 |
"step": 1824
|
2352 |
},
|
2353 |
{
|
2354 |
"epoch": 19.01,
|
2355 |
"learning_rate": 7.439086706555743e-08,
|
2356 |
+
"loss": 2.6545,
|
2357 |
"step": 1825
|
2358 |
},
|
2359 |
{
|
|
|
2377 |
{
|
2378 |
"epoch": 19.22,
|
2379 |
"learning_rate": 4.640895825593683e-08,
|
2380 |
+
"loss": 2.6012,
|
2381 |
"step": 1845
|
2382 |
},
|
2383 |
{
|
2384 |
"epoch": 19.27,
|
2385 |
"learning_rate": 4.0435435515532304e-08,
|
2386 |
+
"loss": 2.6356,
|
2387 |
"step": 1850
|
2388 |
},
|
2389 |
{
|
2390 |
"epoch": 19.32,
|
2391 |
"learning_rate": 3.487173247935627e-08,
|
2392 |
+
"loss": 2.5524,
|
2393 |
"step": 1855
|
2394 |
},
|
2395 |
{
|
|
|
2401 |
{
|
2402 |
"epoch": 19.43,
|
2403 |
"learning_rate": 2.4975590581369778e-08,
|
2404 |
+
"loss": 2.6605,
|
2405 |
"step": 1865
|
2406 |
},
|
2407 |
{
|
2408 |
"epoch": 19.48,
|
2409 |
"learning_rate": 2.0643969459482326e-08,
|
2410 |
+
"loss": 2.5516,
|
2411 |
"step": 1870
|
2412 |
},
|
2413 |
{
|
2414 |
"epoch": 19.53,
|
2415 |
"learning_rate": 1.6723803454098408e-08,
|
2416 |
+
"loss": 2.6022,
|
2417 |
"step": 1875
|
2418 |
},
|
2419 |
{
|
|
|
2425 |
{
|
2426 |
"epoch": 19.64,
|
2427 |
"learning_rate": 1.0119098494316693e-08,
|
2428 |
+
"loss": 2.4395,
|
2429 |
"step": 1885
|
2430 |
},
|
2431 |
{
|
|
|
2437 |
{
|
2438 |
"epoch": 19.74,
|
2439 |
"learning_rate": 5.163658701989316e-09,
|
2440 |
+
"loss": 2.4588,
|
2441 |
"step": 1895
|
2442 |
},
|
2443 |
{
|
|
|
2455 |
{
|
2456 |
"epoch": 19.9,
|
2457 |
"learning_rate": 8.26304875812256e-10,
|
2458 |
+
"loss": 2.6277,
|
2459 |
"step": 1910
|
2460 |
},
|
2461 |
{
|
|
|
2467 |
{
|
2468 |
"epoch": 20.0,
|
2469 |
"learning_rate": 0.0,
|
2470 |
+
"loss": 2.6321,
|
2471 |
"step": 1920
|
2472 |
},
|
2473 |
{
|
2474 |
"epoch": 20.0,
|
2475 |
+
"eval_loss": 2.5677125453948975,
|
2476 |
+
"eval_runtime": 164.791,
|
2477 |
"eval_samples_per_second": 4.648,
|
2478 |
"eval_steps_per_second": 0.583,
|
2479 |
"step": 1920
|
|
|
2482 |
"epoch": 20.0,
|
2483 |
"step": 1920,
|
2484 |
"total_flos": 1.0984887148766822e+18,
|
2485 |
+
"train_loss": 0.16053936282793682,
|
2486 |
+
"train_runtime": 1034.0653,
|
2487 |
+
"train_samples_per_second": 14.815,
|
2488 |
+
"train_steps_per_second": 1.857
|
2489 |
}
|
2490 |
],
|
2491 |
"logging_steps": 5,
|