yyx123 committed (verified)
Commit 0f04192 · 1 Parent(s): 97e7769

Model save
README.md CHANGED
@@ -2,13 +2,11 @@
 license: other
 library_name: peft
 tags:
-- alignment-handbook
-- generated_from_trainer
 - trl
 - sft
 - generated_from_trainer
 datasets:
-- zhihu
+- generator
 base_model: 01-ai/Yi-6B
 model-index:
 - name: Yi-6B-zhihu5
@@ -20,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # Yi-6B-zhihu5
 
-This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the zhihu dataset.
+This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the generator dataset.
 It achieves the following results on the evaluation set:
 - Loss: 2.5677
 
@@ -72,8 +70,8 @@ The following hyperparameters were used during training:
 | 2.5443 | 16.0 | 1536 | 2.5677 |
 | 2.5972 | 17.0 | 1632 | 2.5677 |
 | 2.5361 | 18.0 | 1728 | 2.5677 |
-| 2.5317 | 19.0 | 1824 | 2.5677 |
-| 2.632  | 20.0 | 1920 | 2.5677 |
+| 2.6119 | 19.0 | 1824 | 2.5677 |
+| 2.6321 | 20.0 | 1920 | 2.5677 |
 
 
 ### Framework versions
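
The model card above describes a PEFT adapter sitting on top of 01-ai/Yi-6B. As a minimal sketch (not part of this commit), assuming the adapter lives in a repository named `yyx123/Yi-6B-zhihu5` and that `transformers` and `peft` are installed, loading it could look like this:

```python
# Sketch: attach this LoRA adapter to the Yi-6B base model with PEFT.
# The adapter repo id "yyx123/Yi-6B-zhihu5" is an assumption based on the
# model name in the card; adjust it to the actual repository path.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "01-ai/Yi-6B"
adapter_id = "yyx123/Yi-6B-zhihu5"  # assumed repo id

tokenizer = AutoTokenizer.from_pretrained(base_id)
base_model = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base_model, adapter_id)

inputs = tokenizer("Give a short answer: what is LoRA fine-tuning?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```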
adapter_config.json CHANGED
@@ -19,13 +19,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
     "o_proj",
-    "down_proj",
     "k_proj",
-    "v_proj",
+    "up_proj",
+    "gate_proj",
     "q_proj",
-    "up_proj"
+    "v_proj",
+    "down_proj"
   ],
   "task_type": "CAUSAL_LM"
 }
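
The hunk only reorders `target_modules`; the set of targeted projections (all attention and MLP projections of the Llama-style Yi block) is unchanged, which is consistent with `adapter_model.safetensors` keeping the same size below. A hedged sketch of an equivalent `peft.LoraConfig` follows; the rank, alpha, and dropout values are placeholders, not values taken from this commit:

```python
# Sketch of a LoraConfig matching the target_modules and task_type shown in the diff.
# r, lora_alpha, and lora_dropout are NOT from this commit; they are placeholders.
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,               # placeholder: rank not shown in this hunk
    lora_alpha=32,      # placeholder
    lora_dropout=0.05,  # placeholder
    target_modules=[
        "o_proj",
        "k_proj",
        "up_proj",
        "gate_proj",
        "q_proj",
        "v_proj",
        "down_proj",
    ],
    task_type="CAUSAL_LM",
)
```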
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bad84df15c90e0e8949fa9beda7034fd7e4534df74aba6ea1833c0495c075d13
+oid sha256:68d597f15641f407924e32d9666bbea85b9409352c7904467ce912e1804fb644
 size 72673912
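
This file is a Git LFS pointer: the weights themselves are addressed by the sha256 oid and size above. A quick way to check that a downloaded copy matches the new pointer (the local file path is an assumption):

```python
# Verify a downloaded adapter_model.safetensors against the new LFS pointer.
# The local path is an assumption; point it at wherever the file was downloaded.
import hashlib
import os

expected_oid = "68d597f15641f407924e32d9666bbea85b9409352c7904467ce912e1804fb644"
expected_size = 72673912
path = "adapter_model.safetensors"  # assumed local path

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

print("sha256 matches pointer:", sha256.hexdigest() == expected_oid)
print("size matches pointer:", os.path.getsize(path) == expected_size)
```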
all_results.json CHANGED
@@ -1,13 +1,13 @@
 {
     "epoch": 20.0,
-    "eval_loss": 2.5677051544189453,
-    "eval_runtime": 164.7624,
+    "eval_loss": 2.5677125453948975,
+    "eval_runtime": 164.9005,
     "eval_samples": 2561,
-    "eval_samples_per_second": 4.649,
-    "eval_steps_per_second": 0.583,
-    "train_loss": 0.428011018037796,
-    "train_runtime": 2530.9924,
+    "eval_samples_per_second": 4.645,
+    "eval_steps_per_second": 0.582,
+    "train_loss": 0.16053936282793682,
+    "train_runtime": 1034.0653,
     "train_samples": 2561,
-    "train_samples_per_second": 6.053,
-    "train_steps_per_second": 0.759
+    "train_samples_per_second": 14.815,
+    "train_steps_per_second": 1.857
 }
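
The updated throughput numbers are internally consistent if one assumes the usual HF Trainer convention that samples_per_second is processed samples divided by runtime and steps_per_second is steps divided by runtime (an assumption about the tooling, not something stated in this commit). A quick check:

```python
# Rough consistency check on the new metrics, assuming the HF Trainer convention
# samples_per_second = processed_samples / runtime. Values copied from all_results.json.
eval_samples_per_second = 4.645
eval_steps_per_second = 0.582
train_runtime = 1034.0653
train_samples_per_second = 14.815
train_steps_per_second = 1.857
num_epochs = 20

# Implied samples per step (effective batch size) for eval and train: both ~8.
print(eval_samples_per_second / eval_steps_per_second)    # ~7.98
print(train_samples_per_second / train_steps_per_second)  # ~7.98

# Implied processed sequences per epoch: ~766. This differs from the raw
# "train_samples": 2561, plausibly because the "generator" dataset packs texts
# into fixed-length sequences -- an interpretation, not stated in the commit.
print(train_samples_per_second * train_runtime / num_epochs)
```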
eval_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "epoch": 20.0,
-    "eval_loss": 2.5677051544189453,
-    "eval_runtime": 164.7624,
+    "eval_loss": 2.5677125453948975,
+    "eval_runtime": 164.9005,
     "eval_samples": 2561,
-    "eval_samples_per_second": 4.649,
-    "eval_steps_per_second": 0.583
+    "eval_samples_per_second": 4.645,
+    "eval_steps_per_second": 0.582
 }
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "epoch": 20.0,
-    "train_loss": 0.428011018037796,
-    "train_runtime": 2530.9924,
+    "train_loss": 0.16053936282793682,
+    "train_runtime": 1034.0653,
     "train_samples": 2561,
-    "train_samples_per_second": 6.053,
-    "train_steps_per_second": 0.759
+    "train_samples_per_second": 14.815,
+    "train_steps_per_second": 1.857
 }
trainer_state.json CHANGED
@@ -2321,39 +2321,39 @@
     {
       "epoch": 18.8,
       "learning_rate": 1.0888432552681405e-07,
-      "loss": 2.6601,
+      "loss": 2.4623,
       "step": 1805
     },
     {
       "epoch": 18.85,
       "learning_rate": 9.965285027552452e-08,
-      "loss": 2.5226,
+      "loss": 2.538,
       "step": 1810
     },
     {
       "epoch": 18.91,
       "learning_rate": 9.082630146352356e-08,
-      "loss": 2.4359,
+      "loss": 2.5888,
       "step": 1815
     },
     {
       "epoch": 18.96,
       "learning_rate": 8.240540844791145e-08,
-      "loss": 2.5317,
+      "loss": 2.6119,
       "step": 1820
     },
     {
       "epoch": 19.0,
-      "eval_loss": 2.567706346511841,
-      "eval_runtime": 164.8443,
-      "eval_samples_per_second": 4.647,
+      "eval_loss": 2.567678689956665,
+      "eval_runtime": 164.8569,
+      "eval_samples_per_second": 4.646,
       "eval_steps_per_second": 0.582,
       "step": 1824
     },
     {
       "epoch": 19.01,
       "learning_rate": 7.439086706555743e-08,
-      "loss": 2.5861,
+      "loss": 2.6545,
       "step": 1825
     },
     {
@@ -2377,19 +2377,19 @@
     {
       "epoch": 19.22,
       "learning_rate": 4.640895825593683e-08,
-      "loss": 2.6011,
+      "loss": 2.6012,
       "step": 1845
     },
     {
       "epoch": 19.27,
       "learning_rate": 4.0435435515532304e-08,
-      "loss": 2.6357,
+      "loss": 2.6356,
       "step": 1850
     },
     {
       "epoch": 19.32,
       "learning_rate": 3.487173247935627e-08,
-      "loss": 2.5523,
+      "loss": 2.5524,
       "step": 1855
     },
     {
@@ -2401,19 +2401,19 @@
     {
       "epoch": 19.43,
       "learning_rate": 2.4975590581369778e-08,
-      "loss": 2.6604,
+      "loss": 2.6605,
       "step": 1865
     },
     {
       "epoch": 19.48,
       "learning_rate": 2.0643969459482326e-08,
-      "loss": 2.5515,
+      "loss": 2.5516,
       "step": 1870
     },
     {
       "epoch": 19.53,
       "learning_rate": 1.6723803454098408e-08,
-      "loss": 2.6021,
+      "loss": 2.6022,
       "step": 1875
     },
     {
@@ -2425,7 +2425,7 @@
     {
       "epoch": 19.64,
       "learning_rate": 1.0119098494316693e-08,
-      "loss": 2.4397,
+      "loss": 2.4395,
       "step": 1885
     },
     {
@@ -2437,7 +2437,7 @@
     {
       "epoch": 19.74,
       "learning_rate": 5.163658701989316e-09,
-      "loss": 2.4587,
+      "loss": 2.4588,
       "step": 1895
     },
     {
@@ -2455,7 +2455,7 @@
     {
       "epoch": 19.9,
       "learning_rate": 8.26304875812256e-10,
-      "loss": 2.6275,
+      "loss": 2.6277,
       "step": 1910
     },
     {
@@ -2467,13 +2467,13 @@
     {
       "epoch": 20.0,
       "learning_rate": 0.0,
-      "loss": 2.632,
+      "loss": 2.6321,
       "step": 1920
     },
     {
       "epoch": 20.0,
-      "eval_loss": 2.5677051544189453,
-      "eval_runtime": 164.798,
+      "eval_loss": 2.5677125453948975,
+      "eval_runtime": 164.791,
       "eval_samples_per_second": 4.648,
       "eval_steps_per_second": 0.583,
       "step": 1920
@@ -2482,10 +2482,10 @@
       "epoch": 20.0,
       "step": 1920,
       "total_flos": 1.0984887148766822e+18,
-      "train_loss": 0.428011018037796,
-      "train_runtime": 2530.9924,
-      "train_samples_per_second": 6.053,
-      "train_steps_per_second": 0.759
+      "train_loss": 0.16053936282793682,
+      "train_runtime": 1034.0653,
+      "train_samples_per_second": 14.815,
+      "train_steps_per_second": 1.857
     }
   ],
   "logging_steps": 5,