tanliboy committed
Commit 5a20038
1 Parent(s): 39b1630

Model save

README.md CHANGED
@@ -2,15 +2,12 @@
 license: apache-2.0
 base_model: Qwen/Qwen2-7B
 tags:
-- alignment-handbook
-- trl
-- sft
-- generated_from_trainer
 - trl
 - sft
+- alignment-handbook
 - generated_from_trainer
 datasets:
-- HuggingFaceH4/ultrachat_200k
+- generator
 model-index:
 - name: zephyr-qwen2-7b-sft
   results: []
@@ -21,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
 
 # zephyr-qwen2-7b-sft
 
-This model is a fine-tuned version of [Qwen/Qwen2-7B](https://huggingface.co/Qwen/Qwen2-7B) on the HuggingFaceH4/ultrachat_200k dataset.
+This model is a fine-tuned version of [Qwen/Qwen2-7B](https://huggingface.co/Qwen/Qwen2-7B) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.0646
+- Loss: 1.0645
 
 ## Model description
 
@@ -60,7 +57,7 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 1.0626        | 1.0   | 956  | 1.0646          |
+| 1.0627        | 1.0   | 956  | 1.0645          |
 
 
 ### Framework versions
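
The updated model card now lists the trainer's internal "generator" dataset name (likely the packed dataset the SFT script feeds the Trainer) instead of HuggingFaceH4/ultrachat_200k, and records an eval loss of 1.0645. As a quick illustration, here is a minimal usage sketch that is not part of this commit; the repo id `tanliboy/zephyr-qwen2-7b-sft` and the presence of a chat template are assumptions.

```python
# Minimal usage sketch; repo id and chat template are assumptions, not
# confirmed by this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "tanliboy/zephyr-qwen2-7b-sft"  # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

messages = [{"role": "user", "content": "Explain supervised fine-tuning in one paragraph."}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```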
all_results.json CHANGED
@@ -5,10 +5,10 @@
     "eval_samples": 23109,
     "eval_samples_per_second": 47.288,
     "eval_steps_per_second": 0.74,
-    "total_flos": 500925122740224.0,
-    "train_loss": 1.0766646904426638,
-    "train_runtime": 10839.8443,
+    "total_flos": 500662995517440.0,
+    "train_loss": 0.06220405869902926,
+    "train_runtime": 877.8841,
     "train_samples": 207864,
-    "train_samples_per_second": 11.286,
-    "train_steps_per_second": 0.088
+    "train_samples_per_second": 139.358,
+    "train_steps_per_second": 1.089
 }
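
The new all_results.json reports a much shorter train_runtime (877.88 s vs. 10,839.84 s) and correspondingly different throughput numbers. The logged fields relate to each other roughly as samples processed ≈ train_samples_per_second × train_runtime, which can be sanity-checked with a short illustrative script (not part of the commit):

```python
import json

# Illustrative sanity check of the throughput fields logged in all_results.json.
with open("all_results.json") as f:
    metrics = json.load(f)

# samples_per_second * runtime ~ number of (packed) samples processed in the
# logged training session; this need not equal "train_samples", which counts
# the raw examples in the source dataset.
processed = metrics["train_samples_per_second"] * metrics["train_runtime"]
print(f"runtime: {metrics['train_runtime']:.1f}s, approx samples processed: {processed:,.0f}")
print(f"raw train_samples listed: {metrics['train_samples']:,}")
```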
config.json CHANGED
@@ -22,7 +22,7 @@
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.40.2",
-  "use_cache": true,
+  "use_cache": false,
   "use_sliding_window": false,
   "vocab_size": 152064
 }
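
The only change in config.json is `use_cache` flipping from true to false. This is the value typically written out when the KV cache is disabled for training, most commonly because gradient checkpointing is enabled, and it can simply be re-enabled for inference. A small sketch follows; whether gradient checkpointing was actually used for this run is not visible in the commit.

```python
# Sketch: why a training checkpoint often ends up with "use_cache": false,
# and how to turn the KV cache back on for generation. Whether gradient
# checkpointing was enabled for this particular run is an assumption.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B")

# Gradient checkpointing recomputes activations and is incompatible with the
# KV cache, so the cache is switched off during training:
model.gradient_checkpointing_enable()
model.config.use_cache = False

# For inference, re-enable it (or pass use_cache=True to generate()):
model.config.use_cache = True
```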
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0f6039494ac641d99d13b6f028d2a309d117292be0301ad9e77ccb61d5d09d61
+oid sha256:5346f7673f73f551aaaa605516577660e1eeedcc29154ded68a8a39e1bf72c4c
 size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7907147daaa9fad1b0dd1ccd01bf743294c785fed19b77a288ef3e6cbe91768a
+oid sha256:b42de9be5847138b7967316f3dad2efd4db71d2dc2042256575769e7883a189a
 size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bea1812025c1808c2ebc632f73cad6038809c45373791f1789cf32259447d9d4
+oid sha256:90e5f201a3835937500f20e67d4c791a7124c5a060229fc1f92b17ebf3fda4b2
 size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f3f1bf1dbca8a6fa54c435b5dee8ee34b090850ba9b780c8d6f3083db9d29355
+oid sha256:634e5128c71a098d1f1cad9837cc743ae34a83706e15c9cb9df0ac5d7fc76820
 size 1089994880
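
All four model-0000x-of-00004.safetensors entries are Git LFS pointer files: each records only the sha256 (oid) and byte size of the real weight shard. All four oids change in this commit because the weights were re-saved, while the shard sizes stay identical. A downloaded shard can be checked against its pointer with a short illustrative script (not part of the repo):

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream a file and return its hex sha256, the value Git LFS stores as the oid."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Expected oid taken from the new pointer for the first shard above.
expected = "5346f7673f73f551aaaa605516577660e1eeedcc29154ded68a8a39e1bf72c4c"
print(sha256_of("model-00001-of-00004.safetensors") == expected)
```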
runs/Jun14_18-07-43_action-graph-trainer/events.out.tfevents.1718388492.action-graph-trainer.695665.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c86dcc545bc64c7e85477584418e0651f4252ace1b3894df3f4e8f5359dddf47
+size 7807
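
The newly added file under runs/ is a TensorBoard event log for this training session, again stored as an LFS pointer. Once downloaded, it can be read with TensorBoard's event APIs; the scalar tag names in the sketch below are assumptions about how the run was logged.

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Load the event log (a directory or a single event file path) and list what was logged.
acc = EventAccumulator("runs/Jun14_18-07-43_action-graph-trainer")
acc.Reload()
print(acc.Tags()["scalars"])             # available scalar tags
for event in acc.Scalars("train/loss"):  # assumed tag name
    print(event.step, event.value)
```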
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
     "epoch": 1.0,
-    "total_flos": 500925122740224.0,
-    "train_loss": 1.0766646904426638,
-    "train_runtime": 10839.8443,
+    "total_flos": 500662995517440.0,
+    "train_loss": 0.06220405869902926,
+    "train_runtime": 877.8841,
     "train_samples": 207864,
-    "train_samples_per_second": 11.286,
-    "train_steps_per_second": 0.088
+    "train_samples_per_second": 139.358,
+    "train_steps_per_second": 1.089
 }
trainer_state.json CHANGED
@@ -1277,97 +1277,97 @@
     },
     {
       "epoch": 0.946652719665272,
-      "grad_norm": 0.48247289512404495,
+      "grad_norm": 0.4832426848529184,
       "learning_rate": 1.730440504639408e-07,
       "loss": 1.058,
       "step": 905
     },
     {
       "epoch": 0.9518828451882845,
-      "grad_norm": 0.48826128008832265,
+      "grad_norm": 0.48799198233407015,
       "learning_rate": 1.408530770781813e-07,
       "loss": 1.0526,
       "step": 910
     },
     {
       "epoch": 0.9571129707112971,
-      "grad_norm": 0.4969532853804826,
+      "grad_norm": 0.4969562812336113,
       "learning_rate": 1.1195115097079268e-07,
-      "loss": 1.0685,
+      "loss": 1.0684,
       "step": 915
     },
     {
       "epoch": 0.9623430962343096,
-      "grad_norm": 0.48920579432423383,
+      "grad_norm": 0.4892866844912397,
       "learning_rate": 8.634791392946429e-08,
       "loss": 1.0676,
       "step": 920
     },
     {
       "epoch": 0.9675732217573222,
-      "grad_norm": 0.49070804064035406,
+      "grad_norm": 0.49055262840153824,
       "learning_rate": 6.405190728721033e-08,
       "loss": 1.0455,
       "step": 925
     },
     {
       "epoch": 0.9728033472803347,
-      "grad_norm": 0.47721070672447036,
+      "grad_norm": 0.47689555635255854,
       "learning_rate": 4.5070569072952485e-08,
-      "loss": 1.061,
+      "loss": 1.0609,
       "step": 930
     },
     {
       "epoch": 0.9780334728033473,
-      "grad_norm": 0.4960618241854113,
+      "grad_norm": 0.4955325802322405,
       "learning_rate": 2.9410231530168087e-08,
-      "loss": 1.0499,
+      "loss": 1.0498,
       "step": 935
     },
     {
       "epoch": 0.9832635983263598,
-      "grad_norm": 0.4727473862892527,
+      "grad_norm": 0.47274550178714503,
       "learning_rate": 1.7076119004429958e-08,
       "loss": 1.0763,
       "step": 940
     },
     {
       "epoch": 0.9884937238493724,
-      "grad_norm": 0.4932911494908516,
+      "grad_norm": 0.49350697124044746,
       "learning_rate": 8.072346200544979e-09,
       "loss": 1.0672,
       "step": 945
     },
     {
       "epoch": 0.9937238493723849,
-      "grad_norm": 0.48937555835484964,
+      "grad_norm": 0.4887853656062252,
       "learning_rate": 2.401916809872118e-09,
       "loss": 1.0629,
       "step": 950
     },
     {
       "epoch": 0.9989539748953975,
-      "grad_norm": 0.47678060880513334,
+      "grad_norm": 0.4764956168422736,
       "learning_rate": 6.672250828620996e-11,
-      "loss": 1.0626,
+      "loss": 1.0627,
       "step": 955
     },
     {
       "epoch": 1.0,
-      "eval_loss": 1.0645579099655151,
-      "eval_runtime": 287.2685,
-      "eval_samples_per_second": 47.13,
-      "eval_steps_per_second": 0.738,
+      "eval_loss": 1.0645456314086914,
+      "eval_runtime": 285.3875,
+      "eval_samples_per_second": 47.441,
+      "eval_steps_per_second": 0.743,
       "step": 956
     },
     {
       "epoch": 1.0,
       "step": 956,
-      "total_flos": 500925122740224.0,
-      "train_loss": 1.0766646904426638,
-      "train_runtime": 10839.8443,
-      "train_samples_per_second": 11.286,
-      "train_steps_per_second": 0.088
+      "total_flos": 500662995517440.0,
+      "train_loss": 0.06220405869902926,
+      "train_runtime": 877.8841,
+      "train_samples_per_second": 139.358,
+      "train_steps_per_second": 1.089
     }
   ],
   "logging_steps": 5,
@@ -1375,7 +1375,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
-  "total_flos": 500925122740224.0,
+  "total_flos": 500662995517440.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:639c1527f60111741882e196954883997aec4a9ccba7483a65a28fc1a4187617
+oid sha256:8a961b2dc9433696ebda3812a203e258f2b437c5b7c7dd434f0409fcfebe52fe
 size 6200