kevinpro commited on
Commit
9287780
·
1 Parent(s): b2b4285

Upload 38 files

Browse files
Files changed (38) hide show
  1. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/adapter_config.json +17 -0
  2. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/adapter_model.bin +3 -0
  3. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/optimizer.pt +3 -0
  4. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/pytorch_model.bin +3 -0
  5. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/rng_state_0.pth +3 -0
  6. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/rng_state_1.pth +3 -0
  7. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/rng_state_2.pth +3 -0
  8. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/rng_state_3.pth +3 -0
  9. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/scaler.pt +3 -0
  10. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/scheduler.pt +3 -0
  11. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/trainer_state.json +2160 -0
  12. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/training_args.bin +3 -0
  13. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/adapter_config.json +17 -0
  14. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/adapter_model.bin +3 -0
  15. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/optimizer.pt +3 -0
  16. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/pytorch_model.bin +3 -0
  17. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/rng_state_0.pth +3 -0
  18. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/rng_state_1.pth +3 -0
  19. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/rng_state_2.pth +3 -0
  20. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/rng_state_3.pth +3 -0
  21. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/scaler.pt +3 -0
  22. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/scheduler.pt +3 -0
  23. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/trainer_state.json +2200 -0
  24. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/training_args.bin +3 -0
  25. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/adapter_config.json +17 -0
  26. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/adapter_model.bin +3 -0
  27. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/optimizer.pt +3 -0
  28. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/pytorch_model.bin +3 -0
  29. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/rng_state_0.pth +3 -0
  30. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/rng_state_1.pth +3 -0
  31. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/rng_state_2.pth +3 -0
  32. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/rng_state_3.pth +3 -0
  33. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/scaler.pt +3 -0
  34. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/scheduler.pt +3 -0
  35. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/tokenizer.model +3 -0
  36. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/tokenizer_config.json +33 -0
  37. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/trainer_state.json +2214 -0
  38. vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/training_args.bin +3 -0
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/adapter_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "/mnt/data1/sheshuaijie/Data/PLM/vicuna-7b",
3
+ "bias": "none",
4
+ "fan_in_fan_out": false,
5
+ "inference_mode": true,
6
+ "init_lora_weights": true,
7
+ "lora_alpha": 32,
8
+ "lora_dropout": 0.1,
9
+ "modules_to_save": null,
10
+ "peft_type": "LORA",
11
+ "r": 32,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "task_type": "CAUSAL_LM"
17
+ }
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406
3
+ size 443
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b180b8c0b2d249cbded40020365b0ad58e9f9fb972021e196dcd8a6c9adf02a5
3
+ size 134293701
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42e1be51147b5d21738c841a7542f08b657b68dd69a604f10caf612f72eeb6fa
3
+ size 67154893
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e6a336f0eb31e65d470fb7bbb3c318d68f8540de99147505297369706e956cb
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a14c3fb4d26a3395451687812bd359f3fd071cba0cdbefbacc693f5cc09806b6
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f2062d9172c4b83ac74163dbe5ea73882f2aa0b6cf47bac1ee147a28f45e3f
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:791c74a8fa5a686ccf4034dc3001174c86737c9a81b7e09d7225d6fbee5c0212
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ff694532efc9aa102a9df1b12588365f8a813e88f900aa3b82f011b9ff17989
3
+ size 557
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2c98200ad231d30e6f897712cf004f180062f0ec7f8523760ba97dd2e43c318
3
+ size 627
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/trainer_state.json ADDED
@@ -0,0 +1,2160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6167441010475159,
3
+ "best_model_checkpoint": "/mnt/data1/sheshuaijie/Output/CoT/Trained/vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723",
4
+ "epoch": 9.54061855670103,
5
+ "global_step": 723,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.04,
12
+ "eval_loss": 1.7524008750915527,
13
+ "eval_runtime": 23.7951,
14
+ "eval_samples_per_second": 126.077,
15
+ "eval_steps_per_second": 3.95,
16
+ "step": 3
17
+ },
18
+ {
19
+ "epoch": 0.08,
20
+ "eval_loss": 1.5515066385269165,
21
+ "eval_runtime": 23.828,
22
+ "eval_samples_per_second": 125.902,
23
+ "eval_steps_per_second": 3.945,
24
+ "step": 6
25
+ },
26
+ {
27
+ "epoch": 0.12,
28
+ "eval_loss": 1.3584641218185425,
29
+ "eval_runtime": 23.8775,
30
+ "eval_samples_per_second": 125.641,
31
+ "eval_steps_per_second": 3.937,
32
+ "step": 9
33
+ },
34
+ {
35
+ "epoch": 0.16,
36
+ "eval_loss": 1.2644726037979126,
37
+ "eval_runtime": 23.8942,
38
+ "eval_samples_per_second": 125.554,
39
+ "eval_steps_per_second": 3.934,
40
+ "step": 12
41
+ },
42
+ {
43
+ "epoch": 0.2,
44
+ "eval_loss": 1.166400671005249,
45
+ "eval_runtime": 23.9181,
46
+ "eval_samples_per_second": 125.428,
47
+ "eval_steps_per_second": 3.93,
48
+ "step": 15
49
+ },
50
+ {
51
+ "epoch": 0.24,
52
+ "eval_loss": 1.1086052656173706,
53
+ "eval_runtime": 23.9166,
54
+ "eval_samples_per_second": 125.436,
55
+ "eval_steps_per_second": 3.93,
56
+ "step": 18
57
+ },
58
+ {
59
+ "epoch": 0.26,
60
+ "learning_rate": 0.00019650067294751011,
61
+ "loss": 1.4265,
62
+ "step": 20
63
+ },
64
+ {
65
+ "epoch": 0.28,
66
+ "eval_loss": 1.0677987337112427,
67
+ "eval_runtime": 23.9189,
68
+ "eval_samples_per_second": 125.424,
69
+ "eval_steps_per_second": 3.93,
70
+ "step": 21
71
+ },
72
+ {
73
+ "epoch": 0.32,
74
+ "eval_loss": 1.0342437028884888,
75
+ "eval_runtime": 23.9046,
76
+ "eval_samples_per_second": 125.499,
77
+ "eval_steps_per_second": 3.932,
78
+ "step": 24
79
+ },
80
+ {
81
+ "epoch": 0.36,
82
+ "eval_loss": 0.9985266923904419,
83
+ "eval_runtime": 23.9037,
84
+ "eval_samples_per_second": 125.504,
85
+ "eval_steps_per_second": 3.932,
86
+ "step": 27
87
+ },
88
+ {
89
+ "epoch": 0.4,
90
+ "eval_loss": 0.9654523134231567,
91
+ "eval_runtime": 23.9129,
92
+ "eval_samples_per_second": 125.455,
93
+ "eval_steps_per_second": 3.931,
94
+ "step": 30
95
+ },
96
+ {
97
+ "epoch": 0.44,
98
+ "eval_loss": 0.939262866973877,
99
+ "eval_runtime": 23.9117,
100
+ "eval_samples_per_second": 125.462,
101
+ "eval_steps_per_second": 3.931,
102
+ "step": 33
103
+ },
104
+ {
105
+ "epoch": 0.48,
106
+ "eval_loss": 0.9186767339706421,
107
+ "eval_runtime": 23.9011,
108
+ "eval_samples_per_second": 125.517,
109
+ "eval_steps_per_second": 3.933,
110
+ "step": 36
111
+ },
112
+ {
113
+ "epoch": 0.51,
114
+ "eval_loss": 0.8969741463661194,
115
+ "eval_runtime": 23.9105,
116
+ "eval_samples_per_second": 125.468,
117
+ "eval_steps_per_second": 3.931,
118
+ "step": 39
119
+ },
120
+ {
121
+ "epoch": 0.53,
122
+ "learning_rate": 0.00019111709286675642,
123
+ "loss": 0.9923,
124
+ "step": 40
125
+ },
126
+ {
127
+ "epoch": 0.55,
128
+ "eval_loss": 0.8814375996589661,
129
+ "eval_runtime": 23.9154,
130
+ "eval_samples_per_second": 125.442,
131
+ "eval_steps_per_second": 3.931,
132
+ "step": 42
133
+ },
134
+ {
135
+ "epoch": 0.59,
136
+ "eval_loss": 0.8654683232307434,
137
+ "eval_runtime": 23.9108,
138
+ "eval_samples_per_second": 125.466,
139
+ "eval_steps_per_second": 3.931,
140
+ "step": 45
141
+ },
142
+ {
143
+ "epoch": 0.63,
144
+ "eval_loss": 0.852226734161377,
145
+ "eval_runtime": 23.9186,
146
+ "eval_samples_per_second": 125.425,
147
+ "eval_steps_per_second": 3.93,
148
+ "step": 48
149
+ },
150
+ {
151
+ "epoch": 0.67,
152
+ "eval_loss": 0.839223325252533,
153
+ "eval_runtime": 23.9074,
154
+ "eval_samples_per_second": 125.484,
155
+ "eval_steps_per_second": 3.932,
156
+ "step": 51
157
+ },
158
+ {
159
+ "epoch": 0.71,
160
+ "eval_loss": 0.8266379237174988,
161
+ "eval_runtime": 23.9399,
162
+ "eval_samples_per_second": 125.314,
163
+ "eval_steps_per_second": 3.926,
164
+ "step": 54
165
+ },
166
+ {
167
+ "epoch": 0.75,
168
+ "eval_loss": 0.8140417337417603,
169
+ "eval_runtime": 23.9355,
170
+ "eval_samples_per_second": 125.337,
171
+ "eval_steps_per_second": 3.927,
172
+ "step": 57
173
+ },
174
+ {
175
+ "epoch": 0.79,
176
+ "learning_rate": 0.0001857335127860027,
177
+ "loss": 0.8611,
178
+ "step": 60
179
+ },
180
+ {
181
+ "epoch": 0.79,
182
+ "eval_loss": 0.8019057512283325,
183
+ "eval_runtime": 23.9223,
184
+ "eval_samples_per_second": 125.406,
185
+ "eval_steps_per_second": 3.929,
186
+ "step": 60
187
+ },
188
+ {
189
+ "epoch": 0.83,
190
+ "eval_loss": 0.7907609343528748,
191
+ "eval_runtime": 23.9384,
192
+ "eval_samples_per_second": 125.322,
193
+ "eval_steps_per_second": 3.927,
194
+ "step": 63
195
+ },
196
+ {
197
+ "epoch": 0.87,
198
+ "eval_loss": 0.7791212797164917,
199
+ "eval_runtime": 23.9101,
200
+ "eval_samples_per_second": 125.47,
201
+ "eval_steps_per_second": 3.931,
202
+ "step": 66
203
+ },
204
+ {
205
+ "epoch": 0.91,
206
+ "eval_loss": 0.7694615125656128,
207
+ "eval_runtime": 23.9079,
208
+ "eval_samples_per_second": 125.481,
209
+ "eval_steps_per_second": 3.932,
210
+ "step": 69
211
+ },
212
+ {
213
+ "epoch": 0.95,
214
+ "eval_loss": 0.7602358460426331,
215
+ "eval_runtime": 23.9116,
216
+ "eval_samples_per_second": 125.462,
217
+ "eval_steps_per_second": 3.931,
218
+ "step": 72
219
+ },
220
+ {
221
+ "epoch": 0.99,
222
+ "eval_loss": 0.753226101398468,
223
+ "eval_runtime": 23.9242,
224
+ "eval_samples_per_second": 125.396,
225
+ "eval_steps_per_second": 3.929,
226
+ "step": 75
227
+ },
228
+ {
229
+ "epoch": 1.03,
230
+ "eval_loss": 0.7466432452201843,
231
+ "eval_runtime": 23.9116,
232
+ "eval_samples_per_second": 125.462,
233
+ "eval_steps_per_second": 3.931,
234
+ "step": 78
235
+ },
236
+ {
237
+ "epoch": 1.06,
238
+ "learning_rate": 0.000180349932705249,
239
+ "loss": 0.7843,
240
+ "step": 80
241
+ },
242
+ {
243
+ "epoch": 1.07,
244
+ "eval_loss": 0.7416810989379883,
245
+ "eval_runtime": 23.9171,
246
+ "eval_samples_per_second": 125.433,
247
+ "eval_steps_per_second": 3.93,
248
+ "step": 81
249
+ },
250
+ {
251
+ "epoch": 1.11,
252
+ "eval_loss": 0.7362396121025085,
253
+ "eval_runtime": 23.9079,
254
+ "eval_samples_per_second": 125.481,
255
+ "eval_steps_per_second": 3.932,
256
+ "step": 84
257
+ },
258
+ {
259
+ "epoch": 1.15,
260
+ "eval_loss": 0.7297741174697876,
261
+ "eval_runtime": 23.9084,
262
+ "eval_samples_per_second": 125.479,
263
+ "eval_steps_per_second": 3.932,
264
+ "step": 87
265
+ },
266
+ {
267
+ "epoch": 1.19,
268
+ "eval_loss": 0.7252654433250427,
269
+ "eval_runtime": 23.9206,
270
+ "eval_samples_per_second": 125.415,
271
+ "eval_steps_per_second": 3.93,
272
+ "step": 90
273
+ },
274
+ {
275
+ "epoch": 1.23,
276
+ "eval_loss": 0.7213409543037415,
277
+ "eval_runtime": 23.9179,
278
+ "eval_samples_per_second": 125.429,
279
+ "eval_steps_per_second": 3.93,
280
+ "step": 93
281
+ },
282
+ {
283
+ "epoch": 1.27,
284
+ "eval_loss": 0.7174035906791687,
285
+ "eval_runtime": 23.9354,
286
+ "eval_samples_per_second": 125.337,
287
+ "eval_steps_per_second": 3.927,
288
+ "step": 96
289
+ },
290
+ {
291
+ "epoch": 1.31,
292
+ "eval_loss": 0.7140380144119263,
293
+ "eval_runtime": 23.9214,
294
+ "eval_samples_per_second": 125.411,
295
+ "eval_steps_per_second": 3.93,
296
+ "step": 99
297
+ },
298
+ {
299
+ "epoch": 1.32,
300
+ "learning_rate": 0.0001749663526244953,
301
+ "loss": 0.7301,
302
+ "step": 100
303
+ },
304
+ {
305
+ "epoch": 1.35,
306
+ "eval_loss": 0.7104487419128418,
307
+ "eval_runtime": 23.9093,
308
+ "eval_samples_per_second": 125.474,
309
+ "eval_steps_per_second": 3.932,
310
+ "step": 102
311
+ },
312
+ {
313
+ "epoch": 1.39,
314
+ "eval_loss": 0.7067868113517761,
315
+ "eval_runtime": 23.9129,
316
+ "eval_samples_per_second": 125.455,
317
+ "eval_steps_per_second": 3.931,
318
+ "step": 105
319
+ },
320
+ {
321
+ "epoch": 1.43,
322
+ "eval_loss": 0.7041762471199036,
323
+ "eval_runtime": 23.9161,
324
+ "eval_samples_per_second": 125.439,
325
+ "eval_steps_per_second": 3.93,
326
+ "step": 108
327
+ },
328
+ {
329
+ "epoch": 1.46,
330
+ "eval_loss": 0.7013522982597351,
331
+ "eval_runtime": 23.9133,
332
+ "eval_samples_per_second": 125.453,
333
+ "eval_steps_per_second": 3.931,
334
+ "step": 111
335
+ },
336
+ {
337
+ "epoch": 1.5,
338
+ "eval_loss": 0.6989504098892212,
339
+ "eval_runtime": 23.9152,
340
+ "eval_samples_per_second": 125.443,
341
+ "eval_steps_per_second": 3.931,
342
+ "step": 114
343
+ },
344
+ {
345
+ "epoch": 1.54,
346
+ "eval_loss": 0.6974085569381714,
347
+ "eval_runtime": 23.9561,
348
+ "eval_samples_per_second": 125.229,
349
+ "eval_steps_per_second": 3.924,
350
+ "step": 117
351
+ },
352
+ {
353
+ "epoch": 1.58,
354
+ "learning_rate": 0.0001695827725437416,
355
+ "loss": 0.7141,
356
+ "step": 120
357
+ },
358
+ {
359
+ "epoch": 1.58,
360
+ "eval_loss": 0.6944894194602966,
361
+ "eval_runtime": 23.902,
362
+ "eval_samples_per_second": 125.512,
363
+ "eval_steps_per_second": 3.933,
364
+ "step": 120
365
+ },
366
+ {
367
+ "epoch": 1.62,
368
+ "eval_loss": 0.6929482221603394,
369
+ "eval_runtime": 23.9189,
370
+ "eval_samples_per_second": 125.424,
371
+ "eval_steps_per_second": 3.93,
372
+ "step": 123
373
+ },
374
+ {
375
+ "epoch": 1.66,
376
+ "eval_loss": 0.6903366446495056,
377
+ "eval_runtime": 23.9061,
378
+ "eval_samples_per_second": 125.491,
379
+ "eval_steps_per_second": 3.932,
380
+ "step": 126
381
+ },
382
+ {
383
+ "epoch": 1.7,
384
+ "eval_loss": 0.6882749199867249,
385
+ "eval_runtime": 23.9181,
386
+ "eval_samples_per_second": 125.428,
387
+ "eval_steps_per_second": 3.93,
388
+ "step": 129
389
+ },
390
+ {
391
+ "epoch": 1.74,
392
+ "eval_loss": 0.6863100528717041,
393
+ "eval_runtime": 23.914,
394
+ "eval_samples_per_second": 125.45,
395
+ "eval_steps_per_second": 3.931,
396
+ "step": 132
397
+ },
398
+ {
399
+ "epoch": 1.78,
400
+ "eval_loss": 0.6860549449920654,
401
+ "eval_runtime": 23.9138,
402
+ "eval_samples_per_second": 125.45,
403
+ "eval_steps_per_second": 3.931,
404
+ "step": 135
405
+ },
406
+ {
407
+ "epoch": 1.82,
408
+ "eval_loss": 0.6831715703010559,
409
+ "eval_runtime": 23.9135,
410
+ "eval_samples_per_second": 125.452,
411
+ "eval_steps_per_second": 3.931,
412
+ "step": 138
413
+ },
414
+ {
415
+ "epoch": 1.85,
416
+ "learning_rate": 0.0001641991924629879,
417
+ "loss": 0.6902,
418
+ "step": 140
419
+ },
420
+ {
421
+ "epoch": 1.86,
422
+ "eval_loss": 0.6819499731063843,
423
+ "eval_runtime": 23.8986,
424
+ "eval_samples_per_second": 125.53,
425
+ "eval_steps_per_second": 3.933,
426
+ "step": 141
427
+ },
428
+ {
429
+ "epoch": 1.9,
430
+ "eval_loss": 0.6807693839073181,
431
+ "eval_runtime": 23.9169,
432
+ "eval_samples_per_second": 125.434,
433
+ "eval_steps_per_second": 3.93,
434
+ "step": 144
435
+ },
436
+ {
437
+ "epoch": 1.94,
438
+ "eval_loss": 0.6787669062614441,
439
+ "eval_runtime": 23.9265,
440
+ "eval_samples_per_second": 125.384,
441
+ "eval_steps_per_second": 3.929,
442
+ "step": 147
443
+ },
444
+ {
445
+ "epoch": 1.98,
446
+ "eval_loss": 0.6773442625999451,
447
+ "eval_runtime": 23.9274,
448
+ "eval_samples_per_second": 125.38,
449
+ "eval_steps_per_second": 3.929,
450
+ "step": 150
451
+ },
452
+ {
453
+ "epoch": 2.02,
454
+ "eval_loss": 0.6759281158447266,
455
+ "eval_runtime": 23.9386,
456
+ "eval_samples_per_second": 125.321,
457
+ "eval_steps_per_second": 3.927,
458
+ "step": 153
459
+ },
460
+ {
461
+ "epoch": 2.06,
462
+ "eval_loss": 0.6743582487106323,
463
+ "eval_runtime": 23.9323,
464
+ "eval_samples_per_second": 125.354,
465
+ "eval_steps_per_second": 3.928,
466
+ "step": 156
467
+ },
468
+ {
469
+ "epoch": 2.1,
470
+ "eval_loss": 0.6732926368713379,
471
+ "eval_runtime": 23.9145,
472
+ "eval_samples_per_second": 125.447,
473
+ "eval_steps_per_second": 3.931,
474
+ "step": 159
475
+ },
476
+ {
477
+ "epoch": 2.11,
478
+ "learning_rate": 0.0001588156123822342,
479
+ "loss": 0.6766,
480
+ "step": 160
481
+ },
482
+ {
483
+ "epoch": 2.14,
484
+ "eval_loss": 0.6721953749656677,
485
+ "eval_runtime": 23.9073,
486
+ "eval_samples_per_second": 125.485,
487
+ "eval_steps_per_second": 3.932,
488
+ "step": 162
489
+ },
490
+ {
491
+ "epoch": 2.18,
492
+ "eval_loss": 0.6714429259300232,
493
+ "eval_runtime": 23.8955,
494
+ "eval_samples_per_second": 125.547,
495
+ "eval_steps_per_second": 3.934,
496
+ "step": 165
497
+ },
498
+ {
499
+ "epoch": 2.22,
500
+ "eval_loss": 0.670035183429718,
501
+ "eval_runtime": 23.9431,
502
+ "eval_samples_per_second": 125.297,
503
+ "eval_steps_per_second": 3.926,
504
+ "step": 168
505
+ },
506
+ {
507
+ "epoch": 2.26,
508
+ "eval_loss": 0.6695354580879211,
509
+ "eval_runtime": 23.8875,
510
+ "eval_samples_per_second": 125.589,
511
+ "eval_steps_per_second": 3.935,
512
+ "step": 171
513
+ },
514
+ {
515
+ "epoch": 2.3,
516
+ "eval_loss": 0.6689226031303406,
517
+ "eval_runtime": 23.9185,
518
+ "eval_samples_per_second": 125.426,
519
+ "eval_steps_per_second": 3.93,
520
+ "step": 174
521
+ },
522
+ {
523
+ "epoch": 2.34,
524
+ "eval_loss": 0.6674054861068726,
525
+ "eval_runtime": 23.941,
526
+ "eval_samples_per_second": 125.308,
527
+ "eval_steps_per_second": 3.926,
528
+ "step": 177
529
+ },
530
+ {
531
+ "epoch": 2.38,
532
+ "learning_rate": 0.00015343203230148048,
533
+ "loss": 0.6743,
534
+ "step": 180
535
+ },
536
+ {
537
+ "epoch": 2.38,
538
+ "eval_loss": 0.6664847731590271,
539
+ "eval_runtime": 23.9211,
540
+ "eval_samples_per_second": 125.412,
541
+ "eval_steps_per_second": 3.93,
542
+ "step": 180
543
+ },
544
+ {
545
+ "epoch": 2.41,
546
+ "eval_loss": 0.6658627986907959,
547
+ "eval_runtime": 23.9247,
548
+ "eval_samples_per_second": 125.394,
549
+ "eval_steps_per_second": 3.929,
550
+ "step": 183
551
+ },
552
+ {
553
+ "epoch": 2.45,
554
+ "eval_loss": 0.664908766746521,
555
+ "eval_runtime": 23.9272,
556
+ "eval_samples_per_second": 125.38,
557
+ "eval_steps_per_second": 3.929,
558
+ "step": 186
559
+ },
560
+ {
561
+ "epoch": 2.49,
562
+ "eval_loss": 0.6638036966323853,
563
+ "eval_runtime": 23.9187,
564
+ "eval_samples_per_second": 125.425,
565
+ "eval_steps_per_second": 3.93,
566
+ "step": 189
567
+ },
568
+ {
569
+ "epoch": 2.53,
570
+ "eval_loss": 0.6625837683677673,
571
+ "eval_runtime": 23.9033,
572
+ "eval_samples_per_second": 125.506,
573
+ "eval_steps_per_second": 3.933,
574
+ "step": 192
575
+ },
576
+ {
577
+ "epoch": 2.57,
578
+ "eval_loss": 0.6619511842727661,
579
+ "eval_runtime": 23.8973,
580
+ "eval_samples_per_second": 125.537,
581
+ "eval_steps_per_second": 3.934,
582
+ "step": 195
583
+ },
584
+ {
585
+ "epoch": 2.61,
586
+ "eval_loss": 0.6611769199371338,
587
+ "eval_runtime": 23.9129,
588
+ "eval_samples_per_second": 125.455,
589
+ "eval_steps_per_second": 3.931,
590
+ "step": 198
591
+ },
592
+ {
593
+ "epoch": 2.64,
594
+ "learning_rate": 0.00014804845222072678,
595
+ "loss": 0.6615,
596
+ "step": 200
597
+ },
598
+ {
599
+ "epoch": 2.65,
600
+ "eval_loss": 0.6606143116950989,
601
+ "eval_runtime": 23.9126,
602
+ "eval_samples_per_second": 125.457,
603
+ "eval_steps_per_second": 3.931,
604
+ "step": 201
605
+ },
606
+ {
607
+ "epoch": 2.69,
608
+ "eval_loss": 0.6589743494987488,
609
+ "eval_runtime": 23.9135,
610
+ "eval_samples_per_second": 125.452,
611
+ "eval_steps_per_second": 3.931,
612
+ "step": 204
613
+ },
614
+ {
615
+ "epoch": 2.73,
616
+ "eval_loss": 0.6578481197357178,
617
+ "eval_runtime": 23.9217,
618
+ "eval_samples_per_second": 125.409,
619
+ "eval_steps_per_second": 3.929,
620
+ "step": 207
621
+ },
622
+ {
623
+ "epoch": 2.77,
624
+ "eval_loss": 0.6571096181869507,
625
+ "eval_runtime": 23.9415,
626
+ "eval_samples_per_second": 125.305,
627
+ "eval_steps_per_second": 3.926,
628
+ "step": 210
629
+ },
630
+ {
631
+ "epoch": 2.81,
632
+ "eval_loss": 0.656689465045929,
633
+ "eval_runtime": 23.9111,
634
+ "eval_samples_per_second": 125.465,
635
+ "eval_steps_per_second": 3.931,
636
+ "step": 213
637
+ },
638
+ {
639
+ "epoch": 2.85,
640
+ "eval_loss": 0.6556207537651062,
641
+ "eval_runtime": 23.9099,
642
+ "eval_samples_per_second": 125.471,
643
+ "eval_steps_per_second": 3.931,
644
+ "step": 216
645
+ },
646
+ {
647
+ "epoch": 2.89,
648
+ "eval_loss": 0.6546627283096313,
649
+ "eval_runtime": 23.9164,
650
+ "eval_samples_per_second": 125.437,
651
+ "eval_steps_per_second": 3.93,
652
+ "step": 219
653
+ },
654
+ {
655
+ "epoch": 2.9,
656
+ "learning_rate": 0.0001426648721399731,
657
+ "loss": 0.6564,
658
+ "step": 220
659
+ },
660
+ {
661
+ "epoch": 2.93,
662
+ "eval_loss": 0.6539400815963745,
663
+ "eval_runtime": 23.906,
664
+ "eval_samples_per_second": 125.492,
665
+ "eval_steps_per_second": 3.932,
666
+ "step": 222
667
+ },
668
+ {
669
+ "epoch": 2.97,
670
+ "eval_loss": 0.653684675693512,
671
+ "eval_runtime": 23.9251,
672
+ "eval_samples_per_second": 125.391,
673
+ "eval_steps_per_second": 3.929,
674
+ "step": 225
675
+ },
676
+ {
677
+ "epoch": 3.01,
678
+ "eval_loss": 0.6526629328727722,
679
+ "eval_runtime": 23.9289,
680
+ "eval_samples_per_second": 125.371,
681
+ "eval_steps_per_second": 3.928,
682
+ "step": 228
683
+ },
684
+ {
685
+ "epoch": 3.05,
686
+ "eval_loss": 0.6525079011917114,
687
+ "eval_runtime": 23.9193,
688
+ "eval_samples_per_second": 125.421,
689
+ "eval_steps_per_second": 3.93,
690
+ "step": 231
691
+ },
692
+ {
693
+ "epoch": 3.09,
694
+ "eval_loss": 0.6514959931373596,
695
+ "eval_runtime": 23.9574,
696
+ "eval_samples_per_second": 125.223,
697
+ "eval_steps_per_second": 3.924,
698
+ "step": 234
699
+ },
700
+ {
701
+ "epoch": 3.13,
702
+ "eval_loss": 0.6507047414779663,
703
+ "eval_runtime": 23.9234,
704
+ "eval_samples_per_second": 125.4,
705
+ "eval_steps_per_second": 3.929,
706
+ "step": 237
707
+ },
708
+ {
709
+ "epoch": 3.17,
710
+ "learning_rate": 0.00013728129205921937,
711
+ "loss": 0.6469,
712
+ "step": 240
713
+ },
714
+ {
715
+ "epoch": 3.17,
716
+ "eval_loss": 0.6504186391830444,
717
+ "eval_runtime": 23.937,
718
+ "eval_samples_per_second": 125.329,
719
+ "eval_steps_per_second": 3.927,
720
+ "step": 240
721
+ },
722
+ {
723
+ "epoch": 3.21,
724
+ "eval_loss": 0.6495808959007263,
725
+ "eval_runtime": 23.9188,
726
+ "eval_samples_per_second": 125.425,
727
+ "eval_steps_per_second": 3.93,
728
+ "step": 243
729
+ },
730
+ {
731
+ "epoch": 3.25,
732
+ "eval_loss": 0.649512529373169,
733
+ "eval_runtime": 23.9209,
734
+ "eval_samples_per_second": 125.413,
735
+ "eval_steps_per_second": 3.93,
736
+ "step": 246
737
+ },
738
+ {
739
+ "epoch": 3.29,
740
+ "eval_loss": 0.648629903793335,
741
+ "eval_runtime": 23.9137,
742
+ "eval_samples_per_second": 125.451,
743
+ "eval_steps_per_second": 3.931,
744
+ "step": 249
745
+ },
746
+ {
747
+ "epoch": 3.33,
748
+ "eval_loss": 0.6480894088745117,
749
+ "eval_runtime": 23.919,
750
+ "eval_samples_per_second": 125.423,
751
+ "eval_steps_per_second": 3.93,
752
+ "step": 252
753
+ },
754
+ {
755
+ "epoch": 3.36,
756
+ "eval_loss": 0.6474400758743286,
757
+ "eval_runtime": 23.9076,
758
+ "eval_samples_per_second": 125.483,
759
+ "eval_steps_per_second": 3.932,
760
+ "step": 255
761
+ },
762
+ {
763
+ "epoch": 3.4,
764
+ "eval_loss": 0.6468291878700256,
765
+ "eval_runtime": 23.9305,
766
+ "eval_samples_per_second": 125.363,
767
+ "eval_steps_per_second": 3.928,
768
+ "step": 258
769
+ },
770
+ {
771
+ "epoch": 3.43,
772
+ "learning_rate": 0.00013189771197846567,
773
+ "loss": 0.6463,
774
+ "step": 260
775
+ },
776
+ {
777
+ "epoch": 3.44,
778
+ "eval_loss": 0.6462663412094116,
779
+ "eval_runtime": 23.9359,
780
+ "eval_samples_per_second": 125.335,
781
+ "eval_steps_per_second": 3.927,
782
+ "step": 261
783
+ },
784
+ {
785
+ "epoch": 3.48,
786
+ "eval_loss": 0.6458565592765808,
787
+ "eval_runtime": 23.929,
788
+ "eval_samples_per_second": 125.371,
789
+ "eval_steps_per_second": 3.928,
790
+ "step": 264
791
+ },
792
+ {
793
+ "epoch": 3.52,
794
+ "eval_loss": 0.645412266254425,
795
+ "eval_runtime": 23.9362,
796
+ "eval_samples_per_second": 125.333,
797
+ "eval_steps_per_second": 3.927,
798
+ "step": 267
799
+ },
800
+ {
801
+ "epoch": 3.56,
802
+ "eval_loss": 0.6449554562568665,
803
+ "eval_runtime": 23.9004,
804
+ "eval_samples_per_second": 125.521,
805
+ "eval_steps_per_second": 3.933,
806
+ "step": 270
807
+ },
808
+ {
809
+ "epoch": 3.6,
810
+ "eval_loss": 0.6443325281143188,
811
+ "eval_runtime": 23.9065,
812
+ "eval_samples_per_second": 125.489,
813
+ "eval_steps_per_second": 3.932,
814
+ "step": 273
815
+ },
816
+ {
817
+ "epoch": 3.64,
818
+ "eval_loss": 0.6435034871101379,
819
+ "eval_runtime": 23.9072,
820
+ "eval_samples_per_second": 125.485,
821
+ "eval_steps_per_second": 3.932,
822
+ "step": 276
823
+ },
824
+ {
825
+ "epoch": 3.68,
826
+ "eval_loss": 0.6433733701705933,
827
+ "eval_runtime": 23.9042,
828
+ "eval_samples_per_second": 125.501,
829
+ "eval_steps_per_second": 3.932,
830
+ "step": 279
831
+ },
832
+ {
833
+ "epoch": 3.69,
834
+ "learning_rate": 0.00012651413189771198,
835
+ "loss": 0.6389,
836
+ "step": 280
837
+ },
838
+ {
839
+ "epoch": 3.72,
840
+ "eval_loss": 0.6425070762634277,
841
+ "eval_runtime": 23.8874,
842
+ "eval_samples_per_second": 125.589,
843
+ "eval_steps_per_second": 3.935,
844
+ "step": 282
845
+ },
846
+ {
847
+ "epoch": 3.76,
848
+ "eval_loss": 0.642119288444519,
849
+ "eval_runtime": 23.9328,
850
+ "eval_samples_per_second": 125.351,
851
+ "eval_steps_per_second": 3.928,
852
+ "step": 285
853
+ },
854
+ {
855
+ "epoch": 3.8,
856
+ "eval_loss": 0.641748309135437,
857
+ "eval_runtime": 23.9294,
858
+ "eval_samples_per_second": 125.369,
859
+ "eval_steps_per_second": 3.928,
860
+ "step": 288
861
+ },
862
+ {
863
+ "epoch": 3.84,
864
+ "eval_loss": 0.640826404094696,
865
+ "eval_runtime": 23.9434,
866
+ "eval_samples_per_second": 125.296,
867
+ "eval_steps_per_second": 3.926,
868
+ "step": 291
869
+ },
870
+ {
871
+ "epoch": 3.88,
872
+ "eval_loss": 0.6402388215065002,
873
+ "eval_runtime": 23.9162,
874
+ "eval_samples_per_second": 125.438,
875
+ "eval_steps_per_second": 3.93,
876
+ "step": 294
877
+ },
878
+ {
879
+ "epoch": 3.92,
880
+ "eval_loss": 0.6407353281974792,
881
+ "eval_runtime": 23.9121,
882
+ "eval_samples_per_second": 125.46,
883
+ "eval_steps_per_second": 3.931,
884
+ "step": 297
885
+ },
886
+ {
887
+ "epoch": 3.96,
888
+ "learning_rate": 0.0001211305518169583,
889
+ "loss": 0.6318,
890
+ "step": 300
891
+ },
892
+ {
893
+ "epoch": 3.96,
894
+ "eval_loss": 0.6398600935935974,
895
+ "eval_runtime": 23.9229,
896
+ "eval_samples_per_second": 125.403,
897
+ "eval_steps_per_second": 3.929,
898
+ "step": 300
899
+ },
900
+ {
901
+ "epoch": 4.0,
902
+ "eval_loss": 0.6393464207649231,
903
+ "eval_runtime": 23.9187,
904
+ "eval_samples_per_second": 125.425,
905
+ "eval_steps_per_second": 3.93,
906
+ "step": 303
907
+ },
908
+ {
909
+ "epoch": 4.04,
910
+ "eval_loss": 0.6392526626586914,
911
+ "eval_runtime": 23.9074,
912
+ "eval_samples_per_second": 125.484,
913
+ "eval_steps_per_second": 3.932,
914
+ "step": 306
915
+ },
916
+ {
917
+ "epoch": 4.08,
918
+ "eval_loss": 0.6389594078063965,
919
+ "eval_runtime": 23.918,
920
+ "eval_samples_per_second": 125.428,
921
+ "eval_steps_per_second": 3.93,
922
+ "step": 309
923
+ },
924
+ {
925
+ "epoch": 4.12,
926
+ "eval_loss": 0.6388808488845825,
927
+ "eval_runtime": 23.9158,
928
+ "eval_samples_per_second": 125.44,
929
+ "eval_steps_per_second": 3.93,
930
+ "step": 312
931
+ },
932
+ {
933
+ "epoch": 4.16,
934
+ "eval_loss": 0.6384025812149048,
935
+ "eval_runtime": 23.9176,
936
+ "eval_samples_per_second": 125.431,
937
+ "eval_steps_per_second": 3.93,
938
+ "step": 315
939
+ },
940
+ {
941
+ "epoch": 4.2,
942
+ "eval_loss": 0.6387144923210144,
943
+ "eval_runtime": 23.9047,
944
+ "eval_samples_per_second": 125.498,
945
+ "eval_steps_per_second": 3.932,
946
+ "step": 318
947
+ },
948
+ {
949
+ "epoch": 4.22,
950
+ "learning_rate": 0.00011574697173620459,
951
+ "loss": 0.6277,
952
+ "step": 320
953
+ },
954
+ {
955
+ "epoch": 4.24,
956
+ "eval_loss": 0.6377059817314148,
957
+ "eval_runtime": 23.9246,
958
+ "eval_samples_per_second": 125.394,
959
+ "eval_steps_per_second": 3.929,
960
+ "step": 321
961
+ },
962
+ {
963
+ "epoch": 4.28,
964
+ "eval_loss": 0.636981189250946,
965
+ "eval_runtime": 23.9459,
966
+ "eval_samples_per_second": 125.283,
967
+ "eval_steps_per_second": 3.926,
968
+ "step": 324
969
+ },
970
+ {
971
+ "epoch": 4.32,
972
+ "eval_loss": 0.6364036202430725,
973
+ "eval_runtime": 23.9206,
974
+ "eval_samples_per_second": 125.415,
975
+ "eval_steps_per_second": 3.93,
976
+ "step": 327
977
+ },
978
+ {
979
+ "epoch": 4.35,
980
+ "eval_loss": 0.6357031464576721,
981
+ "eval_runtime": 23.9187,
982
+ "eval_samples_per_second": 125.425,
983
+ "eval_steps_per_second": 3.93,
984
+ "step": 330
985
+ },
986
+ {
987
+ "epoch": 4.39,
988
+ "eval_loss": 0.6366411447525024,
989
+ "eval_runtime": 23.9159,
990
+ "eval_samples_per_second": 125.44,
991
+ "eval_steps_per_second": 3.93,
992
+ "step": 333
993
+ },
994
+ {
995
+ "epoch": 4.43,
996
+ "eval_loss": 0.6357526183128357,
997
+ "eval_runtime": 23.9135,
998
+ "eval_samples_per_second": 125.452,
999
+ "eval_steps_per_second": 3.931,
1000
+ "step": 336
1001
+ },
1002
+ {
1003
+ "epoch": 4.47,
1004
+ "eval_loss": 0.6349912881851196,
1005
+ "eval_runtime": 23.9211,
1006
+ "eval_samples_per_second": 125.412,
1007
+ "eval_steps_per_second": 3.93,
1008
+ "step": 339
1009
+ },
1010
+ {
1011
+ "epoch": 4.49,
1012
+ "learning_rate": 0.00011036339165545088,
1013
+ "loss": 0.6303,
1014
+ "step": 340
1015
+ },
1016
+ {
1017
+ "epoch": 4.51,
1018
+ "eval_loss": 0.6343324184417725,
1019
+ "eval_runtime": 23.927,
1020
+ "eval_samples_per_second": 125.381,
1021
+ "eval_steps_per_second": 3.929,
1022
+ "step": 342
1023
+ },
1024
+ {
1025
+ "epoch": 4.55,
1026
+ "eval_loss": 0.6347218751907349,
1027
+ "eval_runtime": 23.9489,
1028
+ "eval_samples_per_second": 125.267,
1029
+ "eval_steps_per_second": 3.925,
1030
+ "step": 345
1031
+ },
1032
+ {
1033
+ "epoch": 4.59,
1034
+ "eval_loss": 0.6333290338516235,
1035
+ "eval_runtime": 23.9573,
1036
+ "eval_samples_per_second": 125.223,
1037
+ "eval_steps_per_second": 3.924,
1038
+ "step": 348
1039
+ },
1040
+ {
1041
+ "epoch": 4.63,
1042
+ "eval_loss": 0.6328045129776001,
1043
+ "eval_runtime": 23.925,
1044
+ "eval_samples_per_second": 125.392,
1045
+ "eval_steps_per_second": 3.929,
1046
+ "step": 351
1047
+ },
1048
+ {
1049
+ "epoch": 4.67,
1050
+ "eval_loss": 0.6328830718994141,
1051
+ "eval_runtime": 23.9277,
1052
+ "eval_samples_per_second": 125.378,
1053
+ "eval_steps_per_second": 3.928,
1054
+ "step": 354
1055
+ },
1056
+ {
1057
+ "epoch": 4.71,
1058
+ "eval_loss": 0.6323109269142151,
1059
+ "eval_runtime": 23.9385,
1060
+ "eval_samples_per_second": 125.321,
1061
+ "eval_steps_per_second": 3.927,
1062
+ "step": 357
1063
+ },
1064
+ {
1065
+ "epoch": 4.75,
1066
+ "learning_rate": 0.00010497981157469719,
1067
+ "loss": 0.6268,
1068
+ "step": 360
1069
+ },
1070
+ {
1071
+ "epoch": 4.75,
1072
+ "eval_loss": 0.6327587366104126,
1073
+ "eval_runtime": 23.9389,
1074
+ "eval_samples_per_second": 125.319,
1075
+ "eval_steps_per_second": 3.927,
1076
+ "step": 360
1077
+ },
1078
+ {
1079
+ "epoch": 4.79,
1080
+ "eval_loss": 0.6324266791343689,
1081
+ "eval_runtime": 23.9367,
1082
+ "eval_samples_per_second": 125.331,
1083
+ "eval_steps_per_second": 3.927,
1084
+ "step": 363
1085
+ },
1086
+ {
1087
+ "epoch": 4.83,
1088
+ "eval_loss": 0.6320524215698242,
1089
+ "eval_runtime": 23.9373,
1090
+ "eval_samples_per_second": 125.327,
1091
+ "eval_steps_per_second": 3.927,
1092
+ "step": 366
1093
+ },
1094
+ {
1095
+ "epoch": 4.87,
1096
+ "eval_loss": 0.6314539313316345,
1097
+ "eval_runtime": 23.9325,
1098
+ "eval_samples_per_second": 125.352,
1099
+ "eval_steps_per_second": 3.928,
1100
+ "step": 369
1101
+ },
1102
+ {
1103
+ "epoch": 4.91,
1104
+ "eval_loss": 0.6318089365959167,
1105
+ "eval_runtime": 23.9345,
1106
+ "eval_samples_per_second": 125.342,
1107
+ "eval_steps_per_second": 3.927,
1108
+ "step": 372
1109
+ },
1110
+ {
1111
+ "epoch": 4.95,
1112
+ "eval_loss": 0.6315808296203613,
1113
+ "eval_runtime": 23.924,
1114
+ "eval_samples_per_second": 125.397,
1115
+ "eval_steps_per_second": 3.929,
1116
+ "step": 375
1117
+ },
1118
+ {
1119
+ "epoch": 4.99,
1120
+ "eval_loss": 0.630818247795105,
1121
+ "eval_runtime": 23.9285,
1122
+ "eval_samples_per_second": 125.373,
1123
+ "eval_steps_per_second": 3.928,
1124
+ "step": 378
1125
+ },
1126
+ {
1127
+ "epoch": 5.01,
1128
+ "learning_rate": 9.959623149394348e-05,
1129
+ "loss": 0.6196,
1130
+ "step": 380
1131
+ },
1132
+ {
1133
+ "epoch": 5.03,
1134
+ "eval_loss": 0.630248486995697,
1135
+ "eval_runtime": 23.9231,
1136
+ "eval_samples_per_second": 125.402,
1137
+ "eval_steps_per_second": 3.929,
1138
+ "step": 381
1139
+ },
1140
+ {
1141
+ "epoch": 5.07,
1142
+ "eval_loss": 0.6306143403053284,
1143
+ "eval_runtime": 23.9242,
1144
+ "eval_samples_per_second": 125.396,
1145
+ "eval_steps_per_second": 3.929,
1146
+ "step": 384
1147
+ },
1148
+ {
1149
+ "epoch": 5.11,
1150
+ "eval_loss": 0.6305729746818542,
1151
+ "eval_runtime": 23.9232,
1152
+ "eval_samples_per_second": 125.401,
1153
+ "eval_steps_per_second": 3.929,
1154
+ "step": 387
1155
+ },
1156
+ {
1157
+ "epoch": 5.15,
1158
+ "eval_loss": 0.6302648782730103,
1159
+ "eval_runtime": 23.9286,
1160
+ "eval_samples_per_second": 125.373,
1161
+ "eval_steps_per_second": 3.928,
1162
+ "step": 390
1163
+ },
1164
+ {
1165
+ "epoch": 5.19,
1166
+ "eval_loss": 0.6298710703849792,
1167
+ "eval_runtime": 23.9258,
1168
+ "eval_samples_per_second": 125.388,
1169
+ "eval_steps_per_second": 3.929,
1170
+ "step": 393
1171
+ },
1172
+ {
1173
+ "epoch": 5.23,
1174
+ "eval_loss": 0.6298263669013977,
1175
+ "eval_runtime": 23.9284,
1176
+ "eval_samples_per_second": 125.374,
1177
+ "eval_steps_per_second": 3.928,
1178
+ "step": 396
1179
+ },
1180
+ {
1181
+ "epoch": 5.27,
1182
+ "eval_loss": 0.6292470097541809,
1183
+ "eval_runtime": 23.9269,
1184
+ "eval_samples_per_second": 125.382,
1185
+ "eval_steps_per_second": 3.929,
1186
+ "step": 399
1187
+ },
1188
+ {
1189
+ "epoch": 5.28,
1190
+ "learning_rate": 9.421265141318977e-05,
1191
+ "loss": 0.6146,
1192
+ "step": 400
1193
+ },
1194
+ {
1195
+ "epoch": 5.3,
1196
+ "eval_loss": 0.6291049122810364,
1197
+ "eval_runtime": 23.9297,
1198
+ "eval_samples_per_second": 125.367,
1199
+ "eval_steps_per_second": 3.928,
1200
+ "step": 402
1201
+ },
1202
+ {
1203
+ "epoch": 5.34,
1204
+ "eval_loss": 0.6296722292900085,
1205
+ "eval_runtime": 23.9386,
1206
+ "eval_samples_per_second": 125.321,
1207
+ "eval_steps_per_second": 3.927,
1208
+ "step": 405
1209
+ },
1210
+ {
1211
+ "epoch": 5.38,
1212
+ "eval_loss": 0.6288275122642517,
1213
+ "eval_runtime": 23.9308,
1214
+ "eval_samples_per_second": 125.362,
1215
+ "eval_steps_per_second": 3.928,
1216
+ "step": 408
1217
+ },
1218
+ {
1219
+ "epoch": 5.42,
1220
+ "eval_loss": 0.6288333535194397,
1221
+ "eval_runtime": 23.9261,
1222
+ "eval_samples_per_second": 125.386,
1223
+ "eval_steps_per_second": 3.929,
1224
+ "step": 411
1225
+ },
1226
+ {
1227
+ "epoch": 5.46,
1228
+ "eval_loss": 0.6279690861701965,
1229
+ "eval_runtime": 23.9282,
1230
+ "eval_samples_per_second": 125.375,
1231
+ "eval_steps_per_second": 3.928,
1232
+ "step": 414
1233
+ },
1234
+ {
1235
+ "epoch": 5.5,
1236
+ "eval_loss": 0.6275332570075989,
1237
+ "eval_runtime": 23.9215,
1238
+ "eval_samples_per_second": 125.41,
1239
+ "eval_steps_per_second": 3.93,
1240
+ "step": 417
1241
+ },
1242
+ {
1243
+ "epoch": 5.54,
1244
+ "learning_rate": 8.882907133243608e-05,
1245
+ "loss": 0.6149,
1246
+ "step": 420
1247
+ },
1248
+ {
1249
+ "epoch": 5.54,
1250
+ "eval_loss": 0.6279338598251343,
1251
+ "eval_runtime": 23.93,
1252
+ "eval_samples_per_second": 125.366,
1253
+ "eval_steps_per_second": 3.928,
1254
+ "step": 420
1255
+ },
1256
+ {
1257
+ "epoch": 5.58,
1258
+ "eval_loss": 0.6271057724952698,
1259
+ "eval_runtime": 23.9158,
1260
+ "eval_samples_per_second": 125.44,
1261
+ "eval_steps_per_second": 3.93,
1262
+ "step": 423
1263
+ },
1264
+ {
1265
+ "epoch": 5.62,
1266
+ "eval_loss": 0.6270298361778259,
1267
+ "eval_runtime": 23.9264,
1268
+ "eval_samples_per_second": 125.384,
1269
+ "eval_steps_per_second": 3.929,
1270
+ "step": 426
1271
+ },
1272
+ {
1273
+ "epoch": 5.66,
1274
+ "eval_loss": 0.6271407604217529,
1275
+ "eval_runtime": 23.9362,
1276
+ "eval_samples_per_second": 125.333,
1277
+ "eval_steps_per_second": 3.927,
1278
+ "step": 429
1279
+ },
1280
+ {
1281
+ "epoch": 5.7,
1282
+ "eval_loss": 0.6264240145683289,
1283
+ "eval_runtime": 23.9309,
1284
+ "eval_samples_per_second": 125.361,
1285
+ "eval_steps_per_second": 3.928,
1286
+ "step": 432
1287
+ },
1288
+ {
1289
+ "epoch": 5.74,
1290
+ "eval_loss": 0.6263339519500732,
1291
+ "eval_runtime": 23.93,
1292
+ "eval_samples_per_second": 125.366,
1293
+ "eval_steps_per_second": 3.928,
1294
+ "step": 435
1295
+ },
1296
+ {
1297
+ "epoch": 5.78,
1298
+ "eval_loss": 0.6256468296051025,
1299
+ "eval_runtime": 23.9252,
1300
+ "eval_samples_per_second": 125.391,
1301
+ "eval_steps_per_second": 3.929,
1302
+ "step": 438
1303
+ },
1304
+ {
1305
+ "epoch": 5.81,
1306
+ "learning_rate": 8.344549125168237e-05,
1307
+ "loss": 0.6191,
1308
+ "step": 440
1309
+ },
1310
+ {
1311
+ "epoch": 5.82,
1312
+ "eval_loss": 0.6260586977005005,
1313
+ "eval_runtime": 23.918,
1314
+ "eval_samples_per_second": 125.429,
1315
+ "eval_steps_per_second": 3.93,
1316
+ "step": 441
1317
+ },
1318
+ {
1319
+ "epoch": 5.86,
1320
+ "eval_loss": 0.625337541103363,
1321
+ "eval_runtime": 23.9172,
1322
+ "eval_samples_per_second": 125.433,
1323
+ "eval_steps_per_second": 3.93,
1324
+ "step": 444
1325
+ },
1326
+ {
1327
+ "epoch": 5.9,
1328
+ "eval_loss": 0.6246620416641235,
1329
+ "eval_runtime": 23.908,
1330
+ "eval_samples_per_second": 125.481,
1331
+ "eval_steps_per_second": 3.932,
1332
+ "step": 447
1333
+ },
1334
+ {
1335
+ "epoch": 5.94,
1336
+ "eval_loss": 0.6251673698425293,
1337
+ "eval_runtime": 23.9137,
1338
+ "eval_samples_per_second": 125.451,
1339
+ "eval_steps_per_second": 3.931,
1340
+ "step": 450
1341
+ },
1342
+ {
1343
+ "epoch": 5.98,
1344
+ "eval_loss": 0.6253092288970947,
1345
+ "eval_runtime": 23.9409,
1346
+ "eval_samples_per_second": 125.309,
1347
+ "eval_steps_per_second": 3.926,
1348
+ "step": 453
1349
+ },
1350
+ {
1351
+ "epoch": 6.02,
1352
+ "eval_loss": 0.6245599389076233,
1353
+ "eval_runtime": 23.9233,
1354
+ "eval_samples_per_second": 125.401,
1355
+ "eval_steps_per_second": 3.929,
1356
+ "step": 456
1357
+ },
1358
+ {
1359
+ "epoch": 6.06,
1360
+ "eval_loss": 0.6247097849845886,
1361
+ "eval_runtime": 23.9184,
1362
+ "eval_samples_per_second": 125.426,
1363
+ "eval_steps_per_second": 3.93,
1364
+ "step": 459
1365
+ },
1366
+ {
1367
+ "epoch": 6.07,
1368
+ "learning_rate": 7.806191117092868e-05,
1369
+ "loss": 0.6075,
1370
+ "step": 460
1371
+ },
1372
+ {
1373
+ "epoch": 6.1,
1374
+ "eval_loss": 0.6237714290618896,
1375
+ "eval_runtime": 23.9137,
1376
+ "eval_samples_per_second": 125.451,
1377
+ "eval_steps_per_second": 3.931,
1378
+ "step": 462
1379
+ },
1380
+ {
1381
+ "epoch": 6.14,
1382
+ "eval_loss": 0.6239632964134216,
1383
+ "eval_runtime": 23.9142,
1384
+ "eval_samples_per_second": 125.449,
1385
+ "eval_steps_per_second": 3.931,
1386
+ "step": 465
1387
+ },
1388
+ {
1389
+ "epoch": 6.18,
1390
+ "eval_loss": 0.6246253252029419,
1391
+ "eval_runtime": 23.9199,
1392
+ "eval_samples_per_second": 125.419,
1393
+ "eval_steps_per_second": 3.93,
1394
+ "step": 468
1395
+ },
1396
+ {
1397
+ "epoch": 6.22,
1398
+ "eval_loss": 0.6236398220062256,
1399
+ "eval_runtime": 23.9169,
1400
+ "eval_samples_per_second": 125.434,
1401
+ "eval_steps_per_second": 3.93,
1402
+ "step": 471
1403
+ },
1404
+ {
1405
+ "epoch": 6.25,
1406
+ "eval_loss": 0.6242309808731079,
1407
+ "eval_runtime": 23.9091,
1408
+ "eval_samples_per_second": 125.475,
1409
+ "eval_steps_per_second": 3.932,
1410
+ "step": 474
1411
+ },
1412
+ {
1413
+ "epoch": 6.29,
1414
+ "eval_loss": 0.6236902475357056,
1415
+ "eval_runtime": 23.9117,
1416
+ "eval_samples_per_second": 125.462,
1417
+ "eval_steps_per_second": 3.931,
1418
+ "step": 477
1419
+ },
1420
+ {
1421
+ "epoch": 6.33,
1422
+ "learning_rate": 7.267833109017497e-05,
1423
+ "loss": 0.6061,
1424
+ "step": 480
1425
+ },
1426
+ {
1427
+ "epoch": 6.33,
1428
+ "eval_loss": 0.623267650604248,
1429
+ "eval_runtime": 23.9071,
1430
+ "eval_samples_per_second": 125.485,
1431
+ "eval_steps_per_second": 3.932,
1432
+ "step": 480
1433
+ },
1434
+ {
1435
+ "epoch": 6.37,
1436
+ "eval_loss": 0.6238719820976257,
1437
+ "eval_runtime": 23.9206,
1438
+ "eval_samples_per_second": 125.415,
1439
+ "eval_steps_per_second": 3.93,
1440
+ "step": 483
1441
+ },
1442
+ {
1443
+ "epoch": 6.41,
1444
+ "eval_loss": 0.6234752535820007,
1445
+ "eval_runtime": 23.914,
1446
+ "eval_samples_per_second": 125.449,
1447
+ "eval_steps_per_second": 3.931,
1448
+ "step": 486
1449
+ },
1450
+ {
1451
+ "epoch": 6.45,
1452
+ "eval_loss": 0.6228368878364563,
1453
+ "eval_runtime": 23.9087,
1454
+ "eval_samples_per_second": 125.477,
1455
+ "eval_steps_per_second": 3.932,
1456
+ "step": 489
1457
+ },
1458
+ {
1459
+ "epoch": 6.49,
1460
+ "eval_loss": 0.6226744055747986,
1461
+ "eval_runtime": 23.9118,
1462
+ "eval_samples_per_second": 125.461,
1463
+ "eval_steps_per_second": 3.931,
1464
+ "step": 492
1465
+ },
1466
+ {
1467
+ "epoch": 6.53,
1468
+ "eval_loss": 0.622622013092041,
1469
+ "eval_runtime": 23.9341,
1470
+ "eval_samples_per_second": 125.344,
1471
+ "eval_steps_per_second": 3.927,
1472
+ "step": 495
1473
+ },
1474
+ {
1475
+ "epoch": 6.57,
1476
+ "eval_loss": 0.6228298544883728,
1477
+ "eval_runtime": 23.9079,
1478
+ "eval_samples_per_second": 125.482,
1479
+ "eval_steps_per_second": 3.932,
1480
+ "step": 498
1481
+ },
1482
+ {
1483
+ "epoch": 6.6,
1484
+ "learning_rate": 6.729475100942126e-05,
1485
+ "loss": 0.6043,
1486
+ "step": 500
1487
+ },
1488
+ {
1489
+ "epoch": 6.61,
1490
+ "eval_loss": 0.6232237815856934,
1491
+ "eval_runtime": 23.8982,
1492
+ "eval_samples_per_second": 125.533,
1493
+ "eval_steps_per_second": 3.933,
1494
+ "step": 501
1495
+ },
1496
+ {
1497
+ "epoch": 6.65,
1498
+ "eval_loss": 0.6218205690383911,
1499
+ "eval_runtime": 23.9059,
1500
+ "eval_samples_per_second": 125.492,
1501
+ "eval_steps_per_second": 3.932,
1502
+ "step": 504
1503
+ },
1504
+ {
1505
+ "epoch": 6.69,
1506
+ "eval_loss": 0.621903657913208,
1507
+ "eval_runtime": 23.8991,
1508
+ "eval_samples_per_second": 125.528,
1509
+ "eval_steps_per_second": 3.933,
1510
+ "step": 507
1511
+ },
1512
+ {
1513
+ "epoch": 6.73,
1514
+ "eval_loss": 0.622235894203186,
1515
+ "eval_runtime": 23.9024,
1516
+ "eval_samples_per_second": 125.51,
1517
+ "eval_steps_per_second": 3.933,
1518
+ "step": 510
1519
+ },
1520
+ {
1521
+ "epoch": 6.77,
1522
+ "eval_loss": 0.6220830082893372,
1523
+ "eval_runtime": 23.8926,
1524
+ "eval_samples_per_second": 125.562,
1525
+ "eval_steps_per_second": 3.934,
1526
+ "step": 513
1527
+ },
1528
+ {
1529
+ "epoch": 6.81,
1530
+ "eval_loss": 0.6220167875289917,
1531
+ "eval_runtime": 23.8965,
1532
+ "eval_samples_per_second": 125.542,
1533
+ "eval_steps_per_second": 3.934,
1534
+ "step": 516
1535
+ },
1536
+ {
1537
+ "epoch": 6.85,
1538
+ "eval_loss": 0.6222782135009766,
1539
+ "eval_runtime": 23.908,
1540
+ "eval_samples_per_second": 125.481,
1541
+ "eval_steps_per_second": 3.932,
1542
+ "step": 519
1543
+ },
1544
+ {
1545
+ "epoch": 6.86,
1546
+ "learning_rate": 6.191117092866757e-05,
1547
+ "loss": 0.6008,
1548
+ "step": 520
1549
+ },
1550
+ {
1551
+ "epoch": 6.89,
1552
+ "eval_loss": 0.6216304302215576,
1553
+ "eval_runtime": 23.9036,
1554
+ "eval_samples_per_second": 125.504,
1555
+ "eval_steps_per_second": 3.932,
1556
+ "step": 522
1557
+ },
1558
+ {
1559
+ "epoch": 6.93,
1560
+ "eval_loss": 0.6217759847640991,
1561
+ "eval_runtime": 23.9088,
1562
+ "eval_samples_per_second": 125.477,
1563
+ "eval_steps_per_second": 3.932,
1564
+ "step": 525
1565
+ },
1566
+ {
1567
+ "epoch": 6.97,
1568
+ "eval_loss": 0.6214317083358765,
1569
+ "eval_runtime": 23.9177,
1570
+ "eval_samples_per_second": 125.43,
1571
+ "eval_steps_per_second": 3.93,
1572
+ "step": 528
1573
+ },
1574
+ {
1575
+ "epoch": 7.01,
1576
+ "eval_loss": 0.6213416457176208,
1577
+ "eval_runtime": 23.9138,
1578
+ "eval_samples_per_second": 125.451,
1579
+ "eval_steps_per_second": 3.931,
1580
+ "step": 531
1581
+ },
1582
+ {
1583
+ "epoch": 7.05,
1584
+ "eval_loss": 0.6217712163925171,
1585
+ "eval_runtime": 23.9141,
1586
+ "eval_samples_per_second": 125.449,
1587
+ "eval_steps_per_second": 3.931,
1588
+ "step": 534
1589
+ },
1590
+ {
1591
+ "epoch": 7.09,
1592
+ "eval_loss": 0.6215860843658447,
1593
+ "eval_runtime": 23.9145,
1594
+ "eval_samples_per_second": 125.447,
1595
+ "eval_steps_per_second": 3.931,
1596
+ "step": 537
1597
+ },
1598
+ {
1599
+ "epoch": 7.13,
1600
+ "learning_rate": 5.652759084791387e-05,
1601
+ "loss": 0.599,
1602
+ "step": 540
1603
+ },
1604
+ {
1605
+ "epoch": 7.13,
1606
+ "eval_loss": 0.6211041808128357,
1607
+ "eval_runtime": 23.9125,
1608
+ "eval_samples_per_second": 125.457,
1609
+ "eval_steps_per_second": 3.931,
1610
+ "step": 540
1611
+ },
1612
+ {
1613
+ "epoch": 7.17,
1614
+ "eval_loss": 0.6210355758666992,
1615
+ "eval_runtime": 23.911,
1616
+ "eval_samples_per_second": 125.465,
1617
+ "eval_steps_per_second": 3.931,
1618
+ "step": 543
1619
+ },
1620
+ {
1621
+ "epoch": 7.2,
1622
+ "eval_loss": 0.6209889650344849,
1623
+ "eval_runtime": 23.9062,
1624
+ "eval_samples_per_second": 125.491,
1625
+ "eval_steps_per_second": 3.932,
1626
+ "step": 546
1627
+ },
1628
+ {
1629
+ "epoch": 7.24,
1630
+ "eval_loss": 0.6205114126205444,
1631
+ "eval_runtime": 23.9227,
1632
+ "eval_samples_per_second": 125.404,
1633
+ "eval_steps_per_second": 3.929,
1634
+ "step": 549
1635
+ },
1636
+ {
1637
+ "epoch": 7.28,
1638
+ "eval_loss": 0.6204013824462891,
1639
+ "eval_runtime": 23.9146,
1640
+ "eval_samples_per_second": 125.446,
1641
+ "eval_steps_per_second": 3.931,
1642
+ "step": 552
1643
+ },
1644
+ {
1645
+ "epoch": 7.32,
1646
+ "eval_loss": 0.6202988028526306,
1647
+ "eval_runtime": 23.9015,
1648
+ "eval_samples_per_second": 125.515,
1649
+ "eval_steps_per_second": 3.933,
1650
+ "step": 555
1651
+ },
1652
+ {
1653
+ "epoch": 7.36,
1654
+ "eval_loss": 0.6199727654457092,
1655
+ "eval_runtime": 23.9089,
1656
+ "eval_samples_per_second": 125.476,
1657
+ "eval_steps_per_second": 3.932,
1658
+ "step": 558
1659
+ },
1660
+ {
1661
+ "epoch": 7.39,
1662
+ "learning_rate": 5.1144010767160164e-05,
1663
+ "loss": 0.5959,
1664
+ "step": 560
1665
+ },
1666
+ {
1667
+ "epoch": 7.4,
1668
+ "eval_loss": 0.619968831539154,
1669
+ "eval_runtime": 23.9043,
1670
+ "eval_samples_per_second": 125.5,
1671
+ "eval_steps_per_second": 3.932,
1672
+ "step": 561
1673
+ },
1674
+ {
1675
+ "epoch": 7.44,
1676
+ "eval_loss": 0.6202374696731567,
1677
+ "eval_runtime": 23.9117,
1678
+ "eval_samples_per_second": 125.461,
1679
+ "eval_steps_per_second": 3.931,
1680
+ "step": 564
1681
+ },
1682
+ {
1683
+ "epoch": 7.48,
1684
+ "eval_loss": 0.6202066540718079,
1685
+ "eval_runtime": 23.908,
1686
+ "eval_samples_per_second": 125.481,
1687
+ "eval_steps_per_second": 3.932,
1688
+ "step": 567
1689
+ },
1690
+ {
1691
+ "epoch": 7.52,
1692
+ "eval_loss": 0.6198835968971252,
1693
+ "eval_runtime": 23.9244,
1694
+ "eval_samples_per_second": 125.395,
1695
+ "eval_steps_per_second": 3.929,
1696
+ "step": 570
1697
+ },
1698
+ {
1699
+ "epoch": 7.56,
1700
+ "eval_loss": 0.6199198961257935,
1701
+ "eval_runtime": 23.9263,
1702
+ "eval_samples_per_second": 125.385,
1703
+ "eval_steps_per_second": 3.929,
1704
+ "step": 573
1705
+ },
1706
+ {
1707
+ "epoch": 7.6,
1708
+ "eval_loss": 0.6195517182350159,
1709
+ "eval_runtime": 23.9125,
1710
+ "eval_samples_per_second": 125.457,
1711
+ "eval_steps_per_second": 3.931,
1712
+ "step": 576
1713
+ },
1714
+ {
1715
+ "epoch": 7.64,
1716
+ "eval_loss": 0.6192638278007507,
1717
+ "eval_runtime": 23.9168,
1718
+ "eval_samples_per_second": 125.435,
1719
+ "eval_steps_per_second": 3.93,
1720
+ "step": 579
1721
+ },
1722
+ {
1723
+ "epoch": 7.65,
1724
+ "learning_rate": 4.576043068640646e-05,
1725
+ "loss": 0.5922,
1726
+ "step": 580
1727
+ },
1728
+ {
1729
+ "epoch": 7.68,
1730
+ "eval_loss": 0.6196587085723877,
1731
+ "eval_runtime": 23.9305,
1732
+ "eval_samples_per_second": 125.363,
1733
+ "eval_steps_per_second": 3.928,
1734
+ "step": 582
1735
+ },
1736
+ {
1737
+ "epoch": 7.72,
1738
+ "eval_loss": 0.6198856830596924,
1739
+ "eval_runtime": 23.919,
1740
+ "eval_samples_per_second": 125.423,
1741
+ "eval_steps_per_second": 3.93,
1742
+ "step": 585
1743
+ },
1744
+ {
1745
+ "epoch": 7.76,
1746
+ "eval_loss": 0.6196783781051636,
1747
+ "eval_runtime": 23.9042,
1748
+ "eval_samples_per_second": 125.501,
1749
+ "eval_steps_per_second": 3.932,
1750
+ "step": 588
1751
+ },
1752
+ {
1753
+ "epoch": 7.8,
1754
+ "eval_loss": 0.6192678809165955,
1755
+ "eval_runtime": 23.9057,
1756
+ "eval_samples_per_second": 125.493,
1757
+ "eval_steps_per_second": 3.932,
1758
+ "step": 591
1759
+ },
1760
+ {
1761
+ "epoch": 7.84,
1762
+ "eval_loss": 0.6192264556884766,
1763
+ "eval_runtime": 23.909,
1764
+ "eval_samples_per_second": 125.476,
1765
+ "eval_steps_per_second": 3.932,
1766
+ "step": 594
1767
+ },
1768
+ {
1769
+ "epoch": 7.88,
1770
+ "eval_loss": 0.6192458271980286,
1771
+ "eval_runtime": 23.8917,
1772
+ "eval_samples_per_second": 125.567,
1773
+ "eval_steps_per_second": 3.934,
1774
+ "step": 597
1775
+ },
1776
+ {
1777
+ "epoch": 7.92,
1778
+ "learning_rate": 4.037685060565276e-05,
1779
+ "loss": 0.6028,
1780
+ "step": 600
1781
+ },
1782
+ {
1783
+ "epoch": 7.92,
1784
+ "eval_loss": 0.6192883849143982,
1785
+ "eval_runtime": 23.9005,
1786
+ "eval_samples_per_second": 125.521,
1787
+ "eval_steps_per_second": 3.933,
1788
+ "step": 600
1789
+ },
1790
+ {
1791
+ "epoch": 7.96,
1792
+ "eval_loss": 0.6194872856140137,
1793
+ "eval_runtime": 23.9118,
1794
+ "eval_samples_per_second": 125.461,
1795
+ "eval_steps_per_second": 3.931,
1796
+ "step": 603
1797
+ },
1798
+ {
1799
+ "epoch": 8.0,
1800
+ "eval_loss": 0.619368314743042,
1801
+ "eval_runtime": 23.9121,
1802
+ "eval_samples_per_second": 125.46,
1803
+ "eval_steps_per_second": 3.931,
1804
+ "step": 606
1805
+ },
1806
+ {
1807
+ "epoch": 8.04,
1808
+ "eval_loss": 0.6190740466117859,
1809
+ "eval_runtime": 23.9192,
1810
+ "eval_samples_per_second": 125.422,
1811
+ "eval_steps_per_second": 3.93,
1812
+ "step": 609
1813
+ },
1814
+ {
1815
+ "epoch": 8.08,
1816
+ "eval_loss": 0.6190269589424133,
1817
+ "eval_runtime": 23.916,
1818
+ "eval_samples_per_second": 125.439,
1819
+ "eval_steps_per_second": 3.93,
1820
+ "step": 612
1821
+ },
1822
+ {
1823
+ "epoch": 8.12,
1824
+ "eval_loss": 0.6189839839935303,
1825
+ "eval_runtime": 23.9063,
1826
+ "eval_samples_per_second": 125.49,
1827
+ "eval_steps_per_second": 3.932,
1828
+ "step": 615
1829
+ },
1830
+ {
1831
+ "epoch": 8.16,
1832
+ "eval_loss": 0.618523895740509,
1833
+ "eval_runtime": 23.8928,
1834
+ "eval_samples_per_second": 125.561,
1835
+ "eval_steps_per_second": 3.934,
1836
+ "step": 618
1837
+ },
1838
+ {
1839
+ "epoch": 8.18,
1840
+ "learning_rate": 3.499327052489906e-05,
1841
+ "loss": 0.5941,
1842
+ "step": 620
1843
+ },
1844
+ {
1845
+ "epoch": 8.19,
1846
+ "eval_loss": 0.6187476515769958,
1847
+ "eval_runtime": 23.9016,
1848
+ "eval_samples_per_second": 125.515,
1849
+ "eval_steps_per_second": 3.933,
1850
+ "step": 621
1851
+ },
1852
+ {
1853
+ "epoch": 8.23,
1854
+ "eval_loss": 0.6186578869819641,
1855
+ "eval_runtime": 23.9003,
1856
+ "eval_samples_per_second": 125.521,
1857
+ "eval_steps_per_second": 3.933,
1858
+ "step": 624
1859
+ },
1860
+ {
1861
+ "epoch": 8.27,
1862
+ "eval_loss": 0.6190162897109985,
1863
+ "eval_runtime": 23.9155,
1864
+ "eval_samples_per_second": 125.442,
1865
+ "eval_steps_per_second": 3.931,
1866
+ "step": 627
1867
+ },
1868
+ {
1869
+ "epoch": 8.31,
1870
+ "eval_loss": 0.6189883351325989,
1871
+ "eval_runtime": 23.9091,
1872
+ "eval_samples_per_second": 125.475,
1873
+ "eval_steps_per_second": 3.932,
1874
+ "step": 630
1875
+ },
1876
+ {
1877
+ "epoch": 8.35,
1878
+ "eval_loss": 0.6184096932411194,
1879
+ "eval_runtime": 23.9042,
1880
+ "eval_samples_per_second": 125.501,
1881
+ "eval_steps_per_second": 3.932,
1882
+ "step": 633
1883
+ },
1884
+ {
1885
+ "epoch": 8.39,
1886
+ "eval_loss": 0.6180031895637512,
1887
+ "eval_runtime": 23.9189,
1888
+ "eval_samples_per_second": 125.424,
1889
+ "eval_steps_per_second": 3.93,
1890
+ "step": 636
1891
+ },
1892
+ {
1893
+ "epoch": 8.43,
1894
+ "eval_loss": 0.6179867386817932,
1895
+ "eval_runtime": 23.922,
1896
+ "eval_samples_per_second": 125.407,
1897
+ "eval_steps_per_second": 3.929,
1898
+ "step": 639
1899
+ },
1900
+ {
1901
+ "epoch": 8.45,
1902
+ "learning_rate": 2.960969044414536e-05,
1903
+ "loss": 0.5906,
1904
+ "step": 640
1905
+ },
1906
+ {
1907
+ "epoch": 8.47,
1908
+ "eval_loss": 0.6182823777198792,
1909
+ "eval_runtime": 23.9158,
1910
+ "eval_samples_per_second": 125.44,
1911
+ "eval_steps_per_second": 3.93,
1912
+ "step": 642
1913
+ },
1914
+ {
1915
+ "epoch": 8.51,
1916
+ "eval_loss": 0.6179353594779968,
1917
+ "eval_runtime": 23.914,
1918
+ "eval_samples_per_second": 125.45,
1919
+ "eval_steps_per_second": 3.931,
1920
+ "step": 645
1921
+ },
1922
+ {
1923
+ "epoch": 8.55,
1924
+ "eval_loss": 0.6178385615348816,
1925
+ "eval_runtime": 23.9079,
1926
+ "eval_samples_per_second": 125.482,
1927
+ "eval_steps_per_second": 3.932,
1928
+ "step": 648
1929
+ },
1930
+ {
1931
+ "epoch": 8.59,
1932
+ "eval_loss": 0.6179868578910828,
1933
+ "eval_runtime": 23.9189,
1934
+ "eval_samples_per_second": 125.424,
1935
+ "eval_steps_per_second": 3.93,
1936
+ "step": 651
1937
+ },
1938
+ {
1939
+ "epoch": 8.63,
1940
+ "eval_loss": 0.6179595589637756,
1941
+ "eval_runtime": 23.9327,
1942
+ "eval_samples_per_second": 125.352,
1943
+ "eval_steps_per_second": 3.928,
1944
+ "step": 654
1945
+ },
1946
+ {
1947
+ "epoch": 8.67,
1948
+ "eval_loss": 0.6177854537963867,
1949
+ "eval_runtime": 23.9123,
1950
+ "eval_samples_per_second": 125.458,
1951
+ "eval_steps_per_second": 3.931,
1952
+ "step": 657
1953
+ },
1954
+ {
1955
+ "epoch": 8.71,
1956
+ "learning_rate": 2.422611036339166e-05,
1957
+ "loss": 0.5908,
1958
+ "step": 660
1959
+ },
1960
+ {
1961
+ "epoch": 8.71,
1962
+ "eval_loss": 0.6179735064506531,
1963
+ "eval_runtime": 23.9135,
1964
+ "eval_samples_per_second": 125.452,
1965
+ "eval_steps_per_second": 3.931,
1966
+ "step": 660
1967
+ },
1968
+ {
1969
+ "epoch": 8.75,
1970
+ "eval_loss": 0.6180996298789978,
1971
+ "eval_runtime": 23.9038,
1972
+ "eval_samples_per_second": 125.503,
1973
+ "eval_steps_per_second": 3.932,
1974
+ "step": 663
1975
+ },
1976
+ {
1977
+ "epoch": 8.79,
1978
+ "eval_loss": 0.6181532740592957,
1979
+ "eval_runtime": 23.9058,
1980
+ "eval_samples_per_second": 125.492,
1981
+ "eval_steps_per_second": 3.932,
1982
+ "step": 666
1983
+ },
1984
+ {
1985
+ "epoch": 8.83,
1986
+ "eval_loss": 0.6176265478134155,
1987
+ "eval_runtime": 23.9128,
1988
+ "eval_samples_per_second": 125.456,
1989
+ "eval_steps_per_second": 3.931,
1990
+ "step": 669
1991
+ },
1992
+ {
1993
+ "epoch": 8.87,
1994
+ "eval_loss": 0.617388904094696,
1995
+ "eval_runtime": 23.8906,
1996
+ "eval_samples_per_second": 125.572,
1997
+ "eval_steps_per_second": 3.935,
1998
+ "step": 672
1999
+ },
2000
+ {
2001
+ "epoch": 8.91,
2002
+ "eval_loss": 0.6174699068069458,
2003
+ "eval_runtime": 23.9051,
2004
+ "eval_samples_per_second": 125.496,
2005
+ "eval_steps_per_second": 3.932,
2006
+ "step": 675
2007
+ },
2008
+ {
2009
+ "epoch": 8.95,
2010
+ "eval_loss": 0.6174932718276978,
2011
+ "eval_runtime": 23.8969,
2012
+ "eval_samples_per_second": 125.54,
2013
+ "eval_steps_per_second": 3.934,
2014
+ "step": 678
2015
+ },
2016
+ {
2017
+ "epoch": 8.97,
2018
+ "learning_rate": 1.8842530282637954e-05,
2019
+ "loss": 0.5937,
2020
+ "step": 680
2021
+ },
2022
+ {
2023
+ "epoch": 8.99,
2024
+ "eval_loss": 0.6174784898757935,
2025
+ "eval_runtime": 23.9056,
2026
+ "eval_samples_per_second": 125.494,
2027
+ "eval_steps_per_second": 3.932,
2028
+ "step": 681
2029
+ },
2030
+ {
2031
+ "epoch": 9.03,
2032
+ "eval_loss": 0.6175104975700378,
2033
+ "eval_runtime": 23.9136,
2034
+ "eval_samples_per_second": 125.452,
2035
+ "eval_steps_per_second": 3.931,
2036
+ "step": 684
2037
+ },
2038
+ {
2039
+ "epoch": 9.07,
2040
+ "eval_loss": 0.6173563599586487,
2041
+ "eval_runtime": 23.9237,
2042
+ "eval_samples_per_second": 125.399,
2043
+ "eval_steps_per_second": 3.929,
2044
+ "step": 687
2045
+ },
2046
+ {
2047
+ "epoch": 9.11,
2048
+ "eval_loss": 0.6172643899917603,
2049
+ "eval_runtime": 23.9021,
2050
+ "eval_samples_per_second": 125.512,
2051
+ "eval_steps_per_second": 3.933,
2052
+ "step": 690
2053
+ },
2054
+ {
2055
+ "epoch": 9.14,
2056
+ "eval_loss": 0.6172318458557129,
2057
+ "eval_runtime": 23.9242,
2058
+ "eval_samples_per_second": 125.396,
2059
+ "eval_steps_per_second": 3.929,
2060
+ "step": 693
2061
+ },
2062
+ {
2063
+ "epoch": 9.18,
2064
+ "eval_loss": 0.617030680179596,
2065
+ "eval_runtime": 23.9184,
2066
+ "eval_samples_per_second": 125.427,
2067
+ "eval_steps_per_second": 3.93,
2068
+ "step": 696
2069
+ },
2070
+ {
2071
+ "epoch": 9.22,
2072
+ "eval_loss": 0.6169764995574951,
2073
+ "eval_runtime": 23.9104,
2074
+ "eval_samples_per_second": 125.468,
2075
+ "eval_steps_per_second": 3.931,
2076
+ "step": 699
2077
+ },
2078
+ {
2079
+ "epoch": 9.24,
2080
+ "learning_rate": 1.3458950201884254e-05,
2081
+ "loss": 0.5867,
2082
+ "step": 700
2083
+ },
2084
+ {
2085
+ "epoch": 9.26,
2086
+ "eval_loss": 0.6171083450317383,
2087
+ "eval_runtime": 23.9126,
2088
+ "eval_samples_per_second": 125.457,
2089
+ "eval_steps_per_second": 3.931,
2090
+ "step": 702
2091
+ },
2092
+ {
2093
+ "epoch": 9.3,
2094
+ "eval_loss": 0.6171473860740662,
2095
+ "eval_runtime": 23.9015,
2096
+ "eval_samples_per_second": 125.515,
2097
+ "eval_steps_per_second": 3.933,
2098
+ "step": 705
2099
+ },
2100
+ {
2101
+ "epoch": 9.34,
2102
+ "eval_loss": 0.6170982122421265,
2103
+ "eval_runtime": 23.9023,
2104
+ "eval_samples_per_second": 125.511,
2105
+ "eval_steps_per_second": 3.933,
2106
+ "step": 708
2107
+ },
2108
+ {
2109
+ "epoch": 9.38,
2110
+ "eval_loss": 0.6169420480728149,
2111
+ "eval_runtime": 23.9074,
2112
+ "eval_samples_per_second": 125.484,
2113
+ "eval_steps_per_second": 3.932,
2114
+ "step": 711
2115
+ },
2116
+ {
2117
+ "epoch": 9.42,
2118
+ "eval_loss": 0.6168730854988098,
2119
+ "eval_runtime": 23.9047,
2120
+ "eval_samples_per_second": 125.498,
2121
+ "eval_steps_per_second": 3.932,
2122
+ "step": 714
2123
+ },
2124
+ {
2125
+ "epoch": 9.46,
2126
+ "eval_loss": 0.6168663501739502,
2127
+ "eval_runtime": 23.9161,
2128
+ "eval_samples_per_second": 125.439,
2129
+ "eval_steps_per_second": 3.93,
2130
+ "step": 717
2131
+ },
2132
+ {
2133
+ "epoch": 9.5,
2134
+ "learning_rate": 8.075370121130552e-06,
2135
+ "loss": 0.5891,
2136
+ "step": 720
2137
+ },
2138
+ {
2139
+ "epoch": 9.5,
2140
+ "eval_loss": 0.6167708039283752,
2141
+ "eval_runtime": 23.9114,
2142
+ "eval_samples_per_second": 125.463,
2143
+ "eval_steps_per_second": 3.931,
2144
+ "step": 720
2145
+ },
2146
+ {
2147
+ "epoch": 9.54,
2148
+ "eval_loss": 0.6167441010475159,
2149
+ "eval_runtime": 23.9086,
2150
+ "eval_samples_per_second": 125.478,
2151
+ "eval_steps_per_second": 3.932,
2152
+ "step": 723
2153
+ }
2154
+ ],
2155
+ "max_steps": 750,
2156
+ "num_train_epochs": 10,
2157
+ "total_flos": 7.640934816744997e+18,
2158
+ "trial_name": null,
2159
+ "trial_params": null
2160
+ }
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d5a386ce8addef927bca0e390e9534e0877a3e4e00f222f83967dd78c49527a
3
+ size 4027
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/adapter_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "/mnt/data1/sheshuaijie/Data/PLM/vicuna-7b",
3
+ "bias": "none",
4
+ "fan_in_fan_out": false,
5
+ "inference_mode": true,
6
+ "init_lora_weights": true,
7
+ "lora_alpha": 32,
8
+ "lora_dropout": 0.1,
9
+ "modules_to_save": null,
10
+ "peft_type": "LORA",
11
+ "r": 32,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "task_type": "CAUSAL_LM"
17
+ }
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406
3
+ size 443
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea0e58084f5c10cf3198c266194c6df608b78ef7e6bec56f2f2b2f79df7e8203
3
+ size 134293701
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:839608048df3cb485a5a46df3fefc629f75b7a28bc82368b2b0a3f241428230a
3
+ size 67154893
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:662a8264d6501814b20daf7fc26f37adcc57cdcddf3f93b6635ccf090850f087
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da501a848bbcc5c6d41d8d84bf7b38dac81809aa86b59f1ccf322ab2ea71ea07
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c884cfa9665b1300551525efbbf6b6a770236f7c0054643e8504b79dc00b8df
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17fa542e5ca43798e13f106972566af70af0bf43dd5709d5b87e499a7242fcdc
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85ae2de3ba224920992db9fbe1ca60ba4dc264aff58b35933f515fc1ba4b30c1
3
+ size 557
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cad30e23141c113d7a635066ba568eb5da726ce1057d35c2c89d31da5cfb390
3
+ size 627
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/trainer_state.json ADDED
@@ -0,0 +1,2200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6167441010475159,
3
+ "best_model_checkpoint": "/mnt/data1/sheshuaijie/Output/CoT/Trained/vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723",
4
+ "epoch": 9.738556701030928,
5
+ "global_step": 738,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.04,
12
+ "eval_loss": 1.7524008750915527,
13
+ "eval_runtime": 23.7951,
14
+ "eval_samples_per_second": 126.077,
15
+ "eval_steps_per_second": 3.95,
16
+ "step": 3
17
+ },
18
+ {
19
+ "epoch": 0.08,
20
+ "eval_loss": 1.5515066385269165,
21
+ "eval_runtime": 23.828,
22
+ "eval_samples_per_second": 125.902,
23
+ "eval_steps_per_second": 3.945,
24
+ "step": 6
25
+ },
26
+ {
27
+ "epoch": 0.12,
28
+ "eval_loss": 1.3584641218185425,
29
+ "eval_runtime": 23.8775,
30
+ "eval_samples_per_second": 125.641,
31
+ "eval_steps_per_second": 3.937,
32
+ "step": 9
33
+ },
34
+ {
35
+ "epoch": 0.16,
36
+ "eval_loss": 1.2644726037979126,
37
+ "eval_runtime": 23.8942,
38
+ "eval_samples_per_second": 125.554,
39
+ "eval_steps_per_second": 3.934,
40
+ "step": 12
41
+ },
42
+ {
43
+ "epoch": 0.2,
44
+ "eval_loss": 1.166400671005249,
45
+ "eval_runtime": 23.9181,
46
+ "eval_samples_per_second": 125.428,
47
+ "eval_steps_per_second": 3.93,
48
+ "step": 15
49
+ },
50
+ {
51
+ "epoch": 0.24,
52
+ "eval_loss": 1.1086052656173706,
53
+ "eval_runtime": 23.9166,
54
+ "eval_samples_per_second": 125.436,
55
+ "eval_steps_per_second": 3.93,
56
+ "step": 18
57
+ },
58
+ {
59
+ "epoch": 0.26,
60
+ "learning_rate": 0.00019650067294751011,
61
+ "loss": 1.4265,
62
+ "step": 20
63
+ },
64
+ {
65
+ "epoch": 0.28,
66
+ "eval_loss": 1.0677987337112427,
67
+ "eval_runtime": 23.9189,
68
+ "eval_samples_per_second": 125.424,
69
+ "eval_steps_per_second": 3.93,
70
+ "step": 21
71
+ },
72
+ {
73
+ "epoch": 0.32,
74
+ "eval_loss": 1.0342437028884888,
75
+ "eval_runtime": 23.9046,
76
+ "eval_samples_per_second": 125.499,
77
+ "eval_steps_per_second": 3.932,
78
+ "step": 24
79
+ },
80
+ {
81
+ "epoch": 0.36,
82
+ "eval_loss": 0.9985266923904419,
83
+ "eval_runtime": 23.9037,
84
+ "eval_samples_per_second": 125.504,
85
+ "eval_steps_per_second": 3.932,
86
+ "step": 27
87
+ },
88
+ {
89
+ "epoch": 0.4,
90
+ "eval_loss": 0.9654523134231567,
91
+ "eval_runtime": 23.9129,
92
+ "eval_samples_per_second": 125.455,
93
+ "eval_steps_per_second": 3.931,
94
+ "step": 30
95
+ },
96
+ {
97
+ "epoch": 0.44,
98
+ "eval_loss": 0.939262866973877,
99
+ "eval_runtime": 23.9117,
100
+ "eval_samples_per_second": 125.462,
101
+ "eval_steps_per_second": 3.931,
102
+ "step": 33
103
+ },
104
+ {
105
+ "epoch": 0.48,
106
+ "eval_loss": 0.9186767339706421,
107
+ "eval_runtime": 23.9011,
108
+ "eval_samples_per_second": 125.517,
109
+ "eval_steps_per_second": 3.933,
110
+ "step": 36
111
+ },
112
+ {
113
+ "epoch": 0.51,
114
+ "eval_loss": 0.8969741463661194,
115
+ "eval_runtime": 23.9105,
116
+ "eval_samples_per_second": 125.468,
117
+ "eval_steps_per_second": 3.931,
118
+ "step": 39
119
+ },
120
+ {
121
+ "epoch": 0.53,
122
+ "learning_rate": 0.00019111709286675642,
123
+ "loss": 0.9923,
124
+ "step": 40
125
+ },
126
+ {
127
+ "epoch": 0.55,
128
+ "eval_loss": 0.8814375996589661,
129
+ "eval_runtime": 23.9154,
130
+ "eval_samples_per_second": 125.442,
131
+ "eval_steps_per_second": 3.931,
132
+ "step": 42
133
+ },
134
+ {
135
+ "epoch": 0.59,
136
+ "eval_loss": 0.8654683232307434,
137
+ "eval_runtime": 23.9108,
138
+ "eval_samples_per_second": 125.466,
139
+ "eval_steps_per_second": 3.931,
140
+ "step": 45
141
+ },
142
+ {
143
+ "epoch": 0.63,
144
+ "eval_loss": 0.852226734161377,
145
+ "eval_runtime": 23.9186,
146
+ "eval_samples_per_second": 125.425,
147
+ "eval_steps_per_second": 3.93,
148
+ "step": 48
149
+ },
150
+ {
151
+ "epoch": 0.67,
152
+ "eval_loss": 0.839223325252533,
153
+ "eval_runtime": 23.9074,
154
+ "eval_samples_per_second": 125.484,
155
+ "eval_steps_per_second": 3.932,
156
+ "step": 51
157
+ },
158
+ {
159
+ "epoch": 0.71,
160
+ "eval_loss": 0.8266379237174988,
161
+ "eval_runtime": 23.9399,
162
+ "eval_samples_per_second": 125.314,
163
+ "eval_steps_per_second": 3.926,
164
+ "step": 54
165
+ },
166
+ {
167
+ "epoch": 0.75,
168
+ "eval_loss": 0.8140417337417603,
169
+ "eval_runtime": 23.9355,
170
+ "eval_samples_per_second": 125.337,
171
+ "eval_steps_per_second": 3.927,
172
+ "step": 57
173
+ },
174
+ {
175
+ "epoch": 0.79,
176
+ "learning_rate": 0.0001857335127860027,
177
+ "loss": 0.8611,
178
+ "step": 60
179
+ },
180
+ {
181
+ "epoch": 0.79,
182
+ "eval_loss": 0.8019057512283325,
183
+ "eval_runtime": 23.9223,
184
+ "eval_samples_per_second": 125.406,
185
+ "eval_steps_per_second": 3.929,
186
+ "step": 60
187
+ },
188
+ {
189
+ "epoch": 0.83,
190
+ "eval_loss": 0.7907609343528748,
191
+ "eval_runtime": 23.9384,
192
+ "eval_samples_per_second": 125.322,
193
+ "eval_steps_per_second": 3.927,
194
+ "step": 63
195
+ },
196
+ {
197
+ "epoch": 0.87,
198
+ "eval_loss": 0.7791212797164917,
199
+ "eval_runtime": 23.9101,
200
+ "eval_samples_per_second": 125.47,
201
+ "eval_steps_per_second": 3.931,
202
+ "step": 66
203
+ },
204
+ {
205
+ "epoch": 0.91,
206
+ "eval_loss": 0.7694615125656128,
207
+ "eval_runtime": 23.9079,
208
+ "eval_samples_per_second": 125.481,
209
+ "eval_steps_per_second": 3.932,
210
+ "step": 69
211
+ },
212
+ {
213
+ "epoch": 0.95,
214
+ "eval_loss": 0.7602358460426331,
215
+ "eval_runtime": 23.9116,
216
+ "eval_samples_per_second": 125.462,
217
+ "eval_steps_per_second": 3.931,
218
+ "step": 72
219
+ },
220
+ {
221
+ "epoch": 0.99,
222
+ "eval_loss": 0.753226101398468,
223
+ "eval_runtime": 23.9242,
224
+ "eval_samples_per_second": 125.396,
225
+ "eval_steps_per_second": 3.929,
226
+ "step": 75
227
+ },
228
+ {
229
+ "epoch": 1.03,
230
+ "eval_loss": 0.7466432452201843,
231
+ "eval_runtime": 23.9116,
232
+ "eval_samples_per_second": 125.462,
233
+ "eval_steps_per_second": 3.931,
234
+ "step": 78
235
+ },
236
+ {
237
+ "epoch": 1.06,
238
+ "learning_rate": 0.000180349932705249,
239
+ "loss": 0.7843,
240
+ "step": 80
241
+ },
242
+ {
243
+ "epoch": 1.07,
244
+ "eval_loss": 0.7416810989379883,
245
+ "eval_runtime": 23.9171,
246
+ "eval_samples_per_second": 125.433,
247
+ "eval_steps_per_second": 3.93,
248
+ "step": 81
249
+ },
250
+ {
251
+ "epoch": 1.11,
252
+ "eval_loss": 0.7362396121025085,
253
+ "eval_runtime": 23.9079,
254
+ "eval_samples_per_second": 125.481,
255
+ "eval_steps_per_second": 3.932,
256
+ "step": 84
257
+ },
258
+ {
259
+ "epoch": 1.15,
260
+ "eval_loss": 0.7297741174697876,
261
+ "eval_runtime": 23.9084,
262
+ "eval_samples_per_second": 125.479,
263
+ "eval_steps_per_second": 3.932,
264
+ "step": 87
265
+ },
266
+ {
267
+ "epoch": 1.19,
268
+ "eval_loss": 0.7252654433250427,
269
+ "eval_runtime": 23.9206,
270
+ "eval_samples_per_second": 125.415,
271
+ "eval_steps_per_second": 3.93,
272
+ "step": 90
273
+ },
274
+ {
275
+ "epoch": 1.23,
276
+ "eval_loss": 0.7213409543037415,
277
+ "eval_runtime": 23.9179,
278
+ "eval_samples_per_second": 125.429,
279
+ "eval_steps_per_second": 3.93,
280
+ "step": 93
281
+ },
282
+ {
283
+ "epoch": 1.27,
284
+ "eval_loss": 0.7174035906791687,
285
+ "eval_runtime": 23.9354,
286
+ "eval_samples_per_second": 125.337,
287
+ "eval_steps_per_second": 3.927,
288
+ "step": 96
289
+ },
290
+ {
291
+ "epoch": 1.31,
292
+ "eval_loss": 0.7140380144119263,
293
+ "eval_runtime": 23.9214,
294
+ "eval_samples_per_second": 125.411,
295
+ "eval_steps_per_second": 3.93,
296
+ "step": 99
297
+ },
298
+ {
299
+ "epoch": 1.32,
300
+ "learning_rate": 0.0001749663526244953,
301
+ "loss": 0.7301,
302
+ "step": 100
303
+ },
304
+ {
305
+ "epoch": 1.35,
306
+ "eval_loss": 0.7104487419128418,
307
+ "eval_runtime": 23.9093,
308
+ "eval_samples_per_second": 125.474,
309
+ "eval_steps_per_second": 3.932,
310
+ "step": 102
311
+ },
312
+ {
313
+ "epoch": 1.39,
314
+ "eval_loss": 0.7067868113517761,
315
+ "eval_runtime": 23.9129,
316
+ "eval_samples_per_second": 125.455,
317
+ "eval_steps_per_second": 3.931,
318
+ "step": 105
319
+ },
320
+ {
321
+ "epoch": 1.43,
322
+ "eval_loss": 0.7041762471199036,
323
+ "eval_runtime": 23.9161,
324
+ "eval_samples_per_second": 125.439,
325
+ "eval_steps_per_second": 3.93,
326
+ "step": 108
327
+ },
328
+ {
329
+ "epoch": 1.46,
330
+ "eval_loss": 0.7013522982597351,
331
+ "eval_runtime": 23.9133,
332
+ "eval_samples_per_second": 125.453,
333
+ "eval_steps_per_second": 3.931,
334
+ "step": 111
335
+ },
336
+ {
337
+ "epoch": 1.5,
338
+ "eval_loss": 0.6989504098892212,
339
+ "eval_runtime": 23.9152,
340
+ "eval_samples_per_second": 125.443,
341
+ "eval_steps_per_second": 3.931,
342
+ "step": 114
343
+ },
344
+ {
345
+ "epoch": 1.54,
346
+ "eval_loss": 0.6974085569381714,
347
+ "eval_runtime": 23.9561,
348
+ "eval_samples_per_second": 125.229,
349
+ "eval_steps_per_second": 3.924,
350
+ "step": 117
351
+ },
352
+ {
353
+ "epoch": 1.58,
354
+ "learning_rate": 0.0001695827725437416,
355
+ "loss": 0.7141,
356
+ "step": 120
357
+ },
358
+ {
359
+ "epoch": 1.58,
360
+ "eval_loss": 0.6944894194602966,
361
+ "eval_runtime": 23.902,
362
+ "eval_samples_per_second": 125.512,
363
+ "eval_steps_per_second": 3.933,
364
+ "step": 120
365
+ },
366
+ {
367
+ "epoch": 1.62,
368
+ "eval_loss": 0.6929482221603394,
369
+ "eval_runtime": 23.9189,
370
+ "eval_samples_per_second": 125.424,
371
+ "eval_steps_per_second": 3.93,
372
+ "step": 123
373
+ },
374
+ {
375
+ "epoch": 1.66,
376
+ "eval_loss": 0.6903366446495056,
377
+ "eval_runtime": 23.9061,
378
+ "eval_samples_per_second": 125.491,
379
+ "eval_steps_per_second": 3.932,
380
+ "step": 126
381
+ },
382
+ {
383
+ "epoch": 1.7,
384
+ "eval_loss": 0.6882749199867249,
385
+ "eval_runtime": 23.9181,
386
+ "eval_samples_per_second": 125.428,
387
+ "eval_steps_per_second": 3.93,
388
+ "step": 129
389
+ },
390
+ {
391
+ "epoch": 1.74,
392
+ "eval_loss": 0.6863100528717041,
393
+ "eval_runtime": 23.914,
394
+ "eval_samples_per_second": 125.45,
395
+ "eval_steps_per_second": 3.931,
396
+ "step": 132
397
+ },
398
+ {
399
+ "epoch": 1.78,
400
+ "eval_loss": 0.6860549449920654,
401
+ "eval_runtime": 23.9138,
402
+ "eval_samples_per_second": 125.45,
403
+ "eval_steps_per_second": 3.931,
404
+ "step": 135
405
+ },
406
+ {
407
+ "epoch": 1.82,
408
+ "eval_loss": 0.6831715703010559,
409
+ "eval_runtime": 23.9135,
410
+ "eval_samples_per_second": 125.452,
411
+ "eval_steps_per_second": 3.931,
412
+ "step": 138
413
+ },
414
+ {
415
+ "epoch": 1.85,
416
+ "learning_rate": 0.0001641991924629879,
417
+ "loss": 0.6902,
418
+ "step": 140
419
+ },
420
+ {
421
+ "epoch": 1.86,
422
+ "eval_loss": 0.6819499731063843,
423
+ "eval_runtime": 23.8986,
424
+ "eval_samples_per_second": 125.53,
425
+ "eval_steps_per_second": 3.933,
426
+ "step": 141
427
+ },
428
+ {
429
+ "epoch": 1.9,
430
+ "eval_loss": 0.6807693839073181,
431
+ "eval_runtime": 23.9169,
432
+ "eval_samples_per_second": 125.434,
433
+ "eval_steps_per_second": 3.93,
434
+ "step": 144
435
+ },
436
+ {
437
+ "epoch": 1.94,
438
+ "eval_loss": 0.6787669062614441,
439
+ "eval_runtime": 23.9265,
440
+ "eval_samples_per_second": 125.384,
441
+ "eval_steps_per_second": 3.929,
442
+ "step": 147
443
+ },
444
+ {
445
+ "epoch": 1.98,
446
+ "eval_loss": 0.6773442625999451,
447
+ "eval_runtime": 23.9274,
448
+ "eval_samples_per_second": 125.38,
449
+ "eval_steps_per_second": 3.929,
450
+ "step": 150
451
+ },
452
+ {
453
+ "epoch": 2.02,
454
+ "eval_loss": 0.6759281158447266,
455
+ "eval_runtime": 23.9386,
456
+ "eval_samples_per_second": 125.321,
457
+ "eval_steps_per_second": 3.927,
458
+ "step": 153
459
+ },
460
+ {
461
+ "epoch": 2.06,
462
+ "eval_loss": 0.6743582487106323,
463
+ "eval_runtime": 23.9323,
464
+ "eval_samples_per_second": 125.354,
465
+ "eval_steps_per_second": 3.928,
466
+ "step": 156
467
+ },
468
+ {
469
+ "epoch": 2.1,
470
+ "eval_loss": 0.6732926368713379,
471
+ "eval_runtime": 23.9145,
472
+ "eval_samples_per_second": 125.447,
473
+ "eval_steps_per_second": 3.931,
474
+ "step": 159
475
+ },
476
+ {
477
+ "epoch": 2.11,
478
+ "learning_rate": 0.0001588156123822342,
479
+ "loss": 0.6766,
480
+ "step": 160
481
+ },
482
+ {
483
+ "epoch": 2.14,
484
+ "eval_loss": 0.6721953749656677,
485
+ "eval_runtime": 23.9073,
486
+ "eval_samples_per_second": 125.485,
487
+ "eval_steps_per_second": 3.932,
488
+ "step": 162
489
+ },
490
+ {
491
+ "epoch": 2.18,
492
+ "eval_loss": 0.6714429259300232,
493
+ "eval_runtime": 23.8955,
494
+ "eval_samples_per_second": 125.547,
495
+ "eval_steps_per_second": 3.934,
496
+ "step": 165
497
+ },
498
+ {
499
+ "epoch": 2.22,
500
+ "eval_loss": 0.670035183429718,
501
+ "eval_runtime": 23.9431,
502
+ "eval_samples_per_second": 125.297,
503
+ "eval_steps_per_second": 3.926,
504
+ "step": 168
505
+ },
506
+ {
507
+ "epoch": 2.26,
508
+ "eval_loss": 0.6695354580879211,
509
+ "eval_runtime": 23.8875,
510
+ "eval_samples_per_second": 125.589,
511
+ "eval_steps_per_second": 3.935,
512
+ "step": 171
513
+ },
514
+ {
515
+ "epoch": 2.3,
516
+ "eval_loss": 0.6689226031303406,
517
+ "eval_runtime": 23.9185,
518
+ "eval_samples_per_second": 125.426,
519
+ "eval_steps_per_second": 3.93,
520
+ "step": 174
521
+ },
522
+ {
523
+ "epoch": 2.34,
524
+ "eval_loss": 0.6674054861068726,
525
+ "eval_runtime": 23.941,
526
+ "eval_samples_per_second": 125.308,
527
+ "eval_steps_per_second": 3.926,
528
+ "step": 177
529
+ },
530
+ {
531
+ "epoch": 2.38,
532
+ "learning_rate": 0.00015343203230148048,
533
+ "loss": 0.6743,
534
+ "step": 180
535
+ },
536
+ {
537
+ "epoch": 2.38,
538
+ "eval_loss": 0.6664847731590271,
539
+ "eval_runtime": 23.9211,
540
+ "eval_samples_per_second": 125.412,
541
+ "eval_steps_per_second": 3.93,
542
+ "step": 180
543
+ },
544
+ {
545
+ "epoch": 2.41,
546
+ "eval_loss": 0.6658627986907959,
547
+ "eval_runtime": 23.9247,
548
+ "eval_samples_per_second": 125.394,
549
+ "eval_steps_per_second": 3.929,
550
+ "step": 183
551
+ },
552
+ {
553
+ "epoch": 2.45,
554
+ "eval_loss": 0.664908766746521,
555
+ "eval_runtime": 23.9272,
556
+ "eval_samples_per_second": 125.38,
557
+ "eval_steps_per_second": 3.929,
558
+ "step": 186
559
+ },
560
+ {
561
+ "epoch": 2.49,
562
+ "eval_loss": 0.6638036966323853,
563
+ "eval_runtime": 23.9187,
564
+ "eval_samples_per_second": 125.425,
565
+ "eval_steps_per_second": 3.93,
566
+ "step": 189
567
+ },
568
+ {
569
+ "epoch": 2.53,
570
+ "eval_loss": 0.6625837683677673,
571
+ "eval_runtime": 23.9033,
572
+ "eval_samples_per_second": 125.506,
573
+ "eval_steps_per_second": 3.933,
574
+ "step": 192
575
+ },
576
+ {
577
+ "epoch": 2.57,
578
+ "eval_loss": 0.6619511842727661,
579
+ "eval_runtime": 23.8973,
580
+ "eval_samples_per_second": 125.537,
581
+ "eval_steps_per_second": 3.934,
582
+ "step": 195
583
+ },
584
+ {
585
+ "epoch": 2.61,
586
+ "eval_loss": 0.6611769199371338,
587
+ "eval_runtime": 23.9129,
588
+ "eval_samples_per_second": 125.455,
589
+ "eval_steps_per_second": 3.931,
590
+ "step": 198
591
+ },
592
+ {
593
+ "epoch": 2.64,
594
+ "learning_rate": 0.00014804845222072678,
595
+ "loss": 0.6615,
596
+ "step": 200
597
+ },
598
+ {
599
+ "epoch": 2.65,
600
+ "eval_loss": 0.6606143116950989,
601
+ "eval_runtime": 23.9126,
602
+ "eval_samples_per_second": 125.457,
603
+ "eval_steps_per_second": 3.931,
604
+ "step": 201
605
+ },
606
+ {
607
+ "epoch": 2.69,
608
+ "eval_loss": 0.6589743494987488,
609
+ "eval_runtime": 23.9135,
610
+ "eval_samples_per_second": 125.452,
611
+ "eval_steps_per_second": 3.931,
612
+ "step": 204
613
+ },
614
+ {
615
+ "epoch": 2.73,
616
+ "eval_loss": 0.6578481197357178,
617
+ "eval_runtime": 23.9217,
618
+ "eval_samples_per_second": 125.409,
619
+ "eval_steps_per_second": 3.929,
620
+ "step": 207
621
+ },
622
+ {
623
+ "epoch": 2.77,
624
+ "eval_loss": 0.6571096181869507,
625
+ "eval_runtime": 23.9415,
626
+ "eval_samples_per_second": 125.305,
627
+ "eval_steps_per_second": 3.926,
628
+ "step": 210
629
+ },
630
+ {
631
+ "epoch": 2.81,
632
+ "eval_loss": 0.656689465045929,
633
+ "eval_runtime": 23.9111,
634
+ "eval_samples_per_second": 125.465,
635
+ "eval_steps_per_second": 3.931,
636
+ "step": 213
637
+ },
638
+ {
639
+ "epoch": 2.85,
640
+ "eval_loss": 0.6556207537651062,
641
+ "eval_runtime": 23.9099,
642
+ "eval_samples_per_second": 125.471,
643
+ "eval_steps_per_second": 3.931,
644
+ "step": 216
645
+ },
646
+ {
647
+ "epoch": 2.89,
648
+ "eval_loss": 0.6546627283096313,
649
+ "eval_runtime": 23.9164,
650
+ "eval_samples_per_second": 125.437,
651
+ "eval_steps_per_second": 3.93,
652
+ "step": 219
653
+ },
654
+ {
655
+ "epoch": 2.9,
656
+ "learning_rate": 0.0001426648721399731,
657
+ "loss": 0.6564,
658
+ "step": 220
659
+ },
660
+ {
661
+ "epoch": 2.93,
662
+ "eval_loss": 0.6539400815963745,
663
+ "eval_runtime": 23.906,
664
+ "eval_samples_per_second": 125.492,
665
+ "eval_steps_per_second": 3.932,
666
+ "step": 222
667
+ },
668
+ {
669
+ "epoch": 2.97,
670
+ "eval_loss": 0.653684675693512,
671
+ "eval_runtime": 23.9251,
672
+ "eval_samples_per_second": 125.391,
673
+ "eval_steps_per_second": 3.929,
674
+ "step": 225
675
+ },
676
+ {
677
+ "epoch": 3.01,
678
+ "eval_loss": 0.6526629328727722,
679
+ "eval_runtime": 23.9289,
680
+ "eval_samples_per_second": 125.371,
681
+ "eval_steps_per_second": 3.928,
682
+ "step": 228
683
+ },
684
+ {
685
+ "epoch": 3.05,
686
+ "eval_loss": 0.6525079011917114,
687
+ "eval_runtime": 23.9193,
688
+ "eval_samples_per_second": 125.421,
689
+ "eval_steps_per_second": 3.93,
690
+ "step": 231
691
+ },
692
+ {
693
+ "epoch": 3.09,
694
+ "eval_loss": 0.6514959931373596,
695
+ "eval_runtime": 23.9574,
696
+ "eval_samples_per_second": 125.223,
697
+ "eval_steps_per_second": 3.924,
698
+ "step": 234
699
+ },
700
+ {
701
+ "epoch": 3.13,
702
+ "eval_loss": 0.6507047414779663,
703
+ "eval_runtime": 23.9234,
704
+ "eval_samples_per_second": 125.4,
705
+ "eval_steps_per_second": 3.929,
706
+ "step": 237
707
+ },
708
+ {
709
+ "epoch": 3.17,
710
+ "learning_rate": 0.00013728129205921937,
711
+ "loss": 0.6469,
712
+ "step": 240
713
+ },
714
+ {
715
+ "epoch": 3.17,
716
+ "eval_loss": 0.6504186391830444,
717
+ "eval_runtime": 23.937,
718
+ "eval_samples_per_second": 125.329,
719
+ "eval_steps_per_second": 3.927,
720
+ "step": 240
721
+ },
722
+ {
723
+ "epoch": 3.21,
724
+ "eval_loss": 0.6495808959007263,
725
+ "eval_runtime": 23.9188,
726
+ "eval_samples_per_second": 125.425,
727
+ "eval_steps_per_second": 3.93,
728
+ "step": 243
729
+ },
730
+ {
731
+ "epoch": 3.25,
732
+ "eval_loss": 0.649512529373169,
733
+ "eval_runtime": 23.9209,
734
+ "eval_samples_per_second": 125.413,
735
+ "eval_steps_per_second": 3.93,
736
+ "step": 246
737
+ },
738
+ {
739
+ "epoch": 3.29,
740
+ "eval_loss": 0.648629903793335,
741
+ "eval_runtime": 23.9137,
742
+ "eval_samples_per_second": 125.451,
743
+ "eval_steps_per_second": 3.931,
744
+ "step": 249
745
+ },
746
+ {
747
+ "epoch": 3.33,
748
+ "eval_loss": 0.6480894088745117,
749
+ "eval_runtime": 23.919,
750
+ "eval_samples_per_second": 125.423,
751
+ "eval_steps_per_second": 3.93,
752
+ "step": 252
753
+ },
754
+ {
755
+ "epoch": 3.36,
756
+ "eval_loss": 0.6474400758743286,
757
+ "eval_runtime": 23.9076,
758
+ "eval_samples_per_second": 125.483,
759
+ "eval_steps_per_second": 3.932,
760
+ "step": 255
761
+ },
762
+ {
763
+ "epoch": 3.4,
764
+ "eval_loss": 0.6468291878700256,
765
+ "eval_runtime": 23.9305,
766
+ "eval_samples_per_second": 125.363,
767
+ "eval_steps_per_second": 3.928,
768
+ "step": 258
769
+ },
770
+ {
771
+ "epoch": 3.43,
772
+ "learning_rate": 0.00013189771197846567,
773
+ "loss": 0.6463,
774
+ "step": 260
775
+ },
776
+ {
777
+ "epoch": 3.44,
778
+ "eval_loss": 0.6462663412094116,
779
+ "eval_runtime": 23.9359,
780
+ "eval_samples_per_second": 125.335,
781
+ "eval_steps_per_second": 3.927,
782
+ "step": 261
783
+ },
784
+ {
785
+ "epoch": 3.48,
786
+ "eval_loss": 0.6458565592765808,
787
+ "eval_runtime": 23.929,
788
+ "eval_samples_per_second": 125.371,
789
+ "eval_steps_per_second": 3.928,
790
+ "step": 264
791
+ },
792
+ {
793
+ "epoch": 3.52,
794
+ "eval_loss": 0.645412266254425,
795
+ "eval_runtime": 23.9362,
796
+ "eval_samples_per_second": 125.333,
797
+ "eval_steps_per_second": 3.927,
798
+ "step": 267
799
+ },
800
+ {
801
+ "epoch": 3.56,
802
+ "eval_loss": 0.6449554562568665,
803
+ "eval_runtime": 23.9004,
804
+ "eval_samples_per_second": 125.521,
805
+ "eval_steps_per_second": 3.933,
806
+ "step": 270
807
+ },
808
+ {
809
+ "epoch": 3.6,
810
+ "eval_loss": 0.6443325281143188,
811
+ "eval_runtime": 23.9065,
812
+ "eval_samples_per_second": 125.489,
813
+ "eval_steps_per_second": 3.932,
814
+ "step": 273
815
+ },
816
+ {
817
+ "epoch": 3.64,
818
+ "eval_loss": 0.6435034871101379,
819
+ "eval_runtime": 23.9072,
820
+ "eval_samples_per_second": 125.485,
821
+ "eval_steps_per_second": 3.932,
822
+ "step": 276
823
+ },
824
+ {
825
+ "epoch": 3.68,
826
+ "eval_loss": 0.6433733701705933,
827
+ "eval_runtime": 23.9042,
828
+ "eval_samples_per_second": 125.501,
829
+ "eval_steps_per_second": 3.932,
830
+ "step": 279
831
+ },
832
+ {
833
+ "epoch": 3.69,
834
+ "learning_rate": 0.00012651413189771198,
835
+ "loss": 0.6389,
836
+ "step": 280
837
+ },
838
+ {
839
+ "epoch": 3.72,
840
+ "eval_loss": 0.6425070762634277,
841
+ "eval_runtime": 23.8874,
842
+ "eval_samples_per_second": 125.589,
843
+ "eval_steps_per_second": 3.935,
844
+ "step": 282
845
+ },
846
+ {
847
+ "epoch": 3.76,
848
+ "eval_loss": 0.642119288444519,
849
+ "eval_runtime": 23.9328,
850
+ "eval_samples_per_second": 125.351,
851
+ "eval_steps_per_second": 3.928,
852
+ "step": 285
853
+ },
854
+ {
855
+ "epoch": 3.8,
856
+ "eval_loss": 0.641748309135437,
857
+ "eval_runtime": 23.9294,
858
+ "eval_samples_per_second": 125.369,
859
+ "eval_steps_per_second": 3.928,
860
+ "step": 288
861
+ },
862
+ {
863
+ "epoch": 3.84,
864
+ "eval_loss": 0.640826404094696,
865
+ "eval_runtime": 23.9434,
866
+ "eval_samples_per_second": 125.296,
867
+ "eval_steps_per_second": 3.926,
868
+ "step": 291
869
+ },
870
+ {
871
+ "epoch": 3.88,
872
+ "eval_loss": 0.6402388215065002,
873
+ "eval_runtime": 23.9162,
874
+ "eval_samples_per_second": 125.438,
875
+ "eval_steps_per_second": 3.93,
876
+ "step": 294
877
+ },
878
+ {
879
+ "epoch": 3.92,
880
+ "eval_loss": 0.6407353281974792,
881
+ "eval_runtime": 23.9121,
882
+ "eval_samples_per_second": 125.46,
883
+ "eval_steps_per_second": 3.931,
884
+ "step": 297
885
+ },
886
+ {
887
+ "epoch": 3.96,
888
+ "learning_rate": 0.0001211305518169583,
889
+ "loss": 0.6318,
890
+ "step": 300
891
+ },
892
+ {
893
+ "epoch": 3.96,
894
+ "eval_loss": 0.6398600935935974,
895
+ "eval_runtime": 23.9229,
896
+ "eval_samples_per_second": 125.403,
897
+ "eval_steps_per_second": 3.929,
898
+ "step": 300
899
+ },
900
+ {
901
+ "epoch": 4.0,
902
+ "eval_loss": 0.6393464207649231,
903
+ "eval_runtime": 23.9187,
904
+ "eval_samples_per_second": 125.425,
905
+ "eval_steps_per_second": 3.93,
906
+ "step": 303
907
+ },
908
+ {
909
+ "epoch": 4.04,
910
+ "eval_loss": 0.6392526626586914,
911
+ "eval_runtime": 23.9074,
912
+ "eval_samples_per_second": 125.484,
913
+ "eval_steps_per_second": 3.932,
914
+ "step": 306
915
+ },
916
+ {
917
+ "epoch": 4.08,
918
+ "eval_loss": 0.6389594078063965,
919
+ "eval_runtime": 23.918,
920
+ "eval_samples_per_second": 125.428,
921
+ "eval_steps_per_second": 3.93,
922
+ "step": 309
923
+ },
924
+ {
925
+ "epoch": 4.12,
926
+ "eval_loss": 0.6388808488845825,
927
+ "eval_runtime": 23.9158,
928
+ "eval_samples_per_second": 125.44,
929
+ "eval_steps_per_second": 3.93,
930
+ "step": 312
931
+ },
932
+ {
933
+ "epoch": 4.16,
934
+ "eval_loss": 0.6384025812149048,
935
+ "eval_runtime": 23.9176,
936
+ "eval_samples_per_second": 125.431,
937
+ "eval_steps_per_second": 3.93,
938
+ "step": 315
939
+ },
940
+ {
941
+ "epoch": 4.2,
942
+ "eval_loss": 0.6387144923210144,
943
+ "eval_runtime": 23.9047,
944
+ "eval_samples_per_second": 125.498,
945
+ "eval_steps_per_second": 3.932,
946
+ "step": 318
947
+ },
948
+ {
949
+ "epoch": 4.22,
950
+ "learning_rate": 0.00011574697173620459,
951
+ "loss": 0.6277,
952
+ "step": 320
953
+ },
954
+ {
955
+ "epoch": 4.24,
956
+ "eval_loss": 0.6377059817314148,
957
+ "eval_runtime": 23.9246,
958
+ "eval_samples_per_second": 125.394,
959
+ "eval_steps_per_second": 3.929,
960
+ "step": 321
961
+ },
962
+ {
963
+ "epoch": 4.28,
964
+ "eval_loss": 0.636981189250946,
965
+ "eval_runtime": 23.9459,
966
+ "eval_samples_per_second": 125.283,
967
+ "eval_steps_per_second": 3.926,
968
+ "step": 324
969
+ },
970
+ {
971
+ "epoch": 4.32,
972
+ "eval_loss": 0.6364036202430725,
973
+ "eval_runtime": 23.9206,
974
+ "eval_samples_per_second": 125.415,
975
+ "eval_steps_per_second": 3.93,
976
+ "step": 327
977
+ },
978
+ {
979
+ "epoch": 4.35,
980
+ "eval_loss": 0.6357031464576721,
981
+ "eval_runtime": 23.9187,
982
+ "eval_samples_per_second": 125.425,
983
+ "eval_steps_per_second": 3.93,
984
+ "step": 330
985
+ },
986
+ {
987
+ "epoch": 4.39,
988
+ "eval_loss": 0.6366411447525024,
989
+ "eval_runtime": 23.9159,
990
+ "eval_samples_per_second": 125.44,
991
+ "eval_steps_per_second": 3.93,
992
+ "step": 333
993
+ },
994
+ {
995
+ "epoch": 4.43,
996
+ "eval_loss": 0.6357526183128357,
997
+ "eval_runtime": 23.9135,
998
+ "eval_samples_per_second": 125.452,
999
+ "eval_steps_per_second": 3.931,
1000
+ "step": 336
1001
+ },
1002
+ {
1003
+ "epoch": 4.47,
1004
+ "eval_loss": 0.6349912881851196,
1005
+ "eval_runtime": 23.9211,
1006
+ "eval_samples_per_second": 125.412,
1007
+ "eval_steps_per_second": 3.93,
1008
+ "step": 339
1009
+ },
1010
+ {
1011
+ "epoch": 4.49,
1012
+ "learning_rate": 0.00011036339165545088,
1013
+ "loss": 0.6303,
1014
+ "step": 340
1015
+ },
1016
+ {
1017
+ "epoch": 4.51,
1018
+ "eval_loss": 0.6343324184417725,
1019
+ "eval_runtime": 23.927,
1020
+ "eval_samples_per_second": 125.381,
1021
+ "eval_steps_per_second": 3.929,
1022
+ "step": 342
1023
+ },
1024
+ {
1025
+ "epoch": 4.55,
1026
+ "eval_loss": 0.6347218751907349,
1027
+ "eval_runtime": 23.9489,
1028
+ "eval_samples_per_second": 125.267,
1029
+ "eval_steps_per_second": 3.925,
1030
+ "step": 345
1031
+ },
1032
+ {
1033
+ "epoch": 4.59,
1034
+ "eval_loss": 0.6333290338516235,
1035
+ "eval_runtime": 23.9573,
1036
+ "eval_samples_per_second": 125.223,
1037
+ "eval_steps_per_second": 3.924,
1038
+ "step": 348
1039
+ },
1040
+ {
1041
+ "epoch": 4.63,
1042
+ "eval_loss": 0.6328045129776001,
1043
+ "eval_runtime": 23.925,
1044
+ "eval_samples_per_second": 125.392,
1045
+ "eval_steps_per_second": 3.929,
1046
+ "step": 351
1047
+ },
1048
+ {
1049
+ "epoch": 4.67,
1050
+ "eval_loss": 0.6328830718994141,
1051
+ "eval_runtime": 23.9277,
1052
+ "eval_samples_per_second": 125.378,
1053
+ "eval_steps_per_second": 3.928,
1054
+ "step": 354
1055
+ },
1056
+ {
1057
+ "epoch": 4.71,
1058
+ "eval_loss": 0.6323109269142151,
1059
+ "eval_runtime": 23.9385,
1060
+ "eval_samples_per_second": 125.321,
1061
+ "eval_steps_per_second": 3.927,
1062
+ "step": 357
1063
+ },
1064
+ {
1065
+ "epoch": 4.75,
1066
+ "learning_rate": 0.00010497981157469719,
1067
+ "loss": 0.6268,
1068
+ "step": 360
1069
+ },
1070
+ {
1071
+ "epoch": 4.75,
1072
+ "eval_loss": 0.6327587366104126,
1073
+ "eval_runtime": 23.9389,
1074
+ "eval_samples_per_second": 125.319,
1075
+ "eval_steps_per_second": 3.927,
1076
+ "step": 360
1077
+ },
1078
+ {
1079
+ "epoch": 4.79,
1080
+ "eval_loss": 0.6324266791343689,
1081
+ "eval_runtime": 23.9367,
1082
+ "eval_samples_per_second": 125.331,
1083
+ "eval_steps_per_second": 3.927,
1084
+ "step": 363
1085
+ },
1086
+ {
1087
+ "epoch": 4.83,
1088
+ "eval_loss": 0.6320524215698242,
1089
+ "eval_runtime": 23.9373,
1090
+ "eval_samples_per_second": 125.327,
1091
+ "eval_steps_per_second": 3.927,
1092
+ "step": 366
1093
+ },
1094
+ {
1095
+ "epoch": 4.87,
1096
+ "eval_loss": 0.6314539313316345,
1097
+ "eval_runtime": 23.9325,
1098
+ "eval_samples_per_second": 125.352,
1099
+ "eval_steps_per_second": 3.928,
1100
+ "step": 369
1101
+ },
1102
+ {
1103
+ "epoch": 4.91,
1104
+ "eval_loss": 0.6318089365959167,
1105
+ "eval_runtime": 23.9345,
1106
+ "eval_samples_per_second": 125.342,
1107
+ "eval_steps_per_second": 3.927,
1108
+ "step": 372
1109
+ },
1110
+ {
1111
+ "epoch": 4.95,
1112
+ "eval_loss": 0.6315808296203613,
1113
+ "eval_runtime": 23.924,
1114
+ "eval_samples_per_second": 125.397,
1115
+ "eval_steps_per_second": 3.929,
1116
+ "step": 375
1117
+ },
1118
+ {
1119
+ "epoch": 4.99,
1120
+ "eval_loss": 0.630818247795105,
1121
+ "eval_runtime": 23.9285,
1122
+ "eval_samples_per_second": 125.373,
1123
+ "eval_steps_per_second": 3.928,
1124
+ "step": 378
1125
+ },
1126
+ {
1127
+ "epoch": 5.01,
1128
+ "learning_rate": 9.959623149394348e-05,
1129
+ "loss": 0.6196,
1130
+ "step": 380
1131
+ },
1132
+ {
1133
+ "epoch": 5.03,
1134
+ "eval_loss": 0.630248486995697,
1135
+ "eval_runtime": 23.9231,
1136
+ "eval_samples_per_second": 125.402,
1137
+ "eval_steps_per_second": 3.929,
1138
+ "step": 381
1139
+ },
1140
+ {
1141
+ "epoch": 5.07,
1142
+ "eval_loss": 0.6306143403053284,
1143
+ "eval_runtime": 23.9242,
1144
+ "eval_samples_per_second": 125.396,
1145
+ "eval_steps_per_second": 3.929,
1146
+ "step": 384
1147
+ },
1148
+ {
1149
+ "epoch": 5.11,
1150
+ "eval_loss": 0.6305729746818542,
1151
+ "eval_runtime": 23.9232,
1152
+ "eval_samples_per_second": 125.401,
1153
+ "eval_steps_per_second": 3.929,
1154
+ "step": 387
1155
+ },
1156
+ {
1157
+ "epoch": 5.15,
1158
+ "eval_loss": 0.6302648782730103,
1159
+ "eval_runtime": 23.9286,
1160
+ "eval_samples_per_second": 125.373,
1161
+ "eval_steps_per_second": 3.928,
1162
+ "step": 390
1163
+ },
1164
+ {
1165
+ "epoch": 5.19,
1166
+ "eval_loss": 0.6298710703849792,
1167
+ "eval_runtime": 23.9258,
1168
+ "eval_samples_per_second": 125.388,
1169
+ "eval_steps_per_second": 3.929,
1170
+ "step": 393
1171
+ },
1172
+ {
1173
+ "epoch": 5.23,
1174
+ "eval_loss": 0.6298263669013977,
1175
+ "eval_runtime": 23.9284,
1176
+ "eval_samples_per_second": 125.374,
1177
+ "eval_steps_per_second": 3.928,
1178
+ "step": 396
1179
+ },
1180
+ {
1181
+ "epoch": 5.27,
1182
+ "eval_loss": 0.6292470097541809,
1183
+ "eval_runtime": 23.9269,
1184
+ "eval_samples_per_second": 125.382,
1185
+ "eval_steps_per_second": 3.929,
1186
+ "step": 399
1187
+ },
1188
+ {
1189
+ "epoch": 5.28,
1190
+ "learning_rate": 9.421265141318977e-05,
1191
+ "loss": 0.6146,
1192
+ "step": 400
1193
+ },
1194
+ {
1195
+ "epoch": 5.3,
1196
+ "eval_loss": 0.6291049122810364,
1197
+ "eval_runtime": 23.9297,
1198
+ "eval_samples_per_second": 125.367,
1199
+ "eval_steps_per_second": 3.928,
1200
+ "step": 402
1201
+ },
1202
+ {
1203
+ "epoch": 5.34,
1204
+ "eval_loss": 0.6296722292900085,
1205
+ "eval_runtime": 23.9386,
1206
+ "eval_samples_per_second": 125.321,
1207
+ "eval_steps_per_second": 3.927,
1208
+ "step": 405
1209
+ },
1210
+ {
1211
+ "epoch": 5.38,
1212
+ "eval_loss": 0.6288275122642517,
1213
+ "eval_runtime": 23.9308,
1214
+ "eval_samples_per_second": 125.362,
1215
+ "eval_steps_per_second": 3.928,
1216
+ "step": 408
1217
+ },
1218
+ {
1219
+ "epoch": 5.42,
1220
+ "eval_loss": 0.6288333535194397,
1221
+ "eval_runtime": 23.9261,
1222
+ "eval_samples_per_second": 125.386,
1223
+ "eval_steps_per_second": 3.929,
1224
+ "step": 411
1225
+ },
1226
+ {
1227
+ "epoch": 5.46,
1228
+ "eval_loss": 0.6279690861701965,
1229
+ "eval_runtime": 23.9282,
1230
+ "eval_samples_per_second": 125.375,
1231
+ "eval_steps_per_second": 3.928,
1232
+ "step": 414
1233
+ },
1234
+ {
1235
+ "epoch": 5.5,
1236
+ "eval_loss": 0.6275332570075989,
1237
+ "eval_runtime": 23.9215,
1238
+ "eval_samples_per_second": 125.41,
1239
+ "eval_steps_per_second": 3.93,
1240
+ "step": 417
1241
+ },
1242
+ {
1243
+ "epoch": 5.54,
1244
+ "learning_rate": 8.882907133243608e-05,
1245
+ "loss": 0.6149,
1246
+ "step": 420
1247
+ },
1248
+ {
1249
+ "epoch": 5.54,
1250
+ "eval_loss": 0.6279338598251343,
1251
+ "eval_runtime": 23.93,
1252
+ "eval_samples_per_second": 125.366,
1253
+ "eval_steps_per_second": 3.928,
1254
+ "step": 420
1255
+ },
1256
+ {
1257
+ "epoch": 5.58,
1258
+ "eval_loss": 0.6271057724952698,
1259
+ "eval_runtime": 23.9158,
1260
+ "eval_samples_per_second": 125.44,
1261
+ "eval_steps_per_second": 3.93,
1262
+ "step": 423
1263
+ },
1264
+ {
1265
+ "epoch": 5.62,
1266
+ "eval_loss": 0.6270298361778259,
1267
+ "eval_runtime": 23.9264,
1268
+ "eval_samples_per_second": 125.384,
1269
+ "eval_steps_per_second": 3.929,
1270
+ "step": 426
1271
+ },
1272
+ {
1273
+ "epoch": 5.66,
1274
+ "eval_loss": 0.6271407604217529,
1275
+ "eval_runtime": 23.9362,
1276
+ "eval_samples_per_second": 125.333,
1277
+ "eval_steps_per_second": 3.927,
1278
+ "step": 429
1279
+ },
1280
+ {
1281
+ "epoch": 5.7,
1282
+ "eval_loss": 0.6264240145683289,
1283
+ "eval_runtime": 23.9309,
1284
+ "eval_samples_per_second": 125.361,
1285
+ "eval_steps_per_second": 3.928,
1286
+ "step": 432
1287
+ },
1288
+ {
1289
+ "epoch": 5.74,
1290
+ "eval_loss": 0.6263339519500732,
1291
+ "eval_runtime": 23.93,
1292
+ "eval_samples_per_second": 125.366,
1293
+ "eval_steps_per_second": 3.928,
1294
+ "step": 435
1295
+ },
1296
+ {
1297
+ "epoch": 5.78,
1298
+ "eval_loss": 0.6256468296051025,
1299
+ "eval_runtime": 23.9252,
1300
+ "eval_samples_per_second": 125.391,
1301
+ "eval_steps_per_second": 3.929,
1302
+ "step": 438
1303
+ },
1304
+ {
1305
+ "epoch": 5.81,
1306
+ "learning_rate": 8.344549125168237e-05,
1307
+ "loss": 0.6191,
1308
+ "step": 440
1309
+ },
1310
+ {
1311
+ "epoch": 5.82,
1312
+ "eval_loss": 0.6260586977005005,
1313
+ "eval_runtime": 23.918,
1314
+ "eval_samples_per_second": 125.429,
1315
+ "eval_steps_per_second": 3.93,
1316
+ "step": 441
1317
+ },
1318
+ {
1319
+ "epoch": 5.86,
1320
+ "eval_loss": 0.625337541103363,
1321
+ "eval_runtime": 23.9172,
1322
+ "eval_samples_per_second": 125.433,
1323
+ "eval_steps_per_second": 3.93,
1324
+ "step": 444
1325
+ },
1326
+ {
1327
+ "epoch": 5.9,
1328
+ "eval_loss": 0.6246620416641235,
1329
+ "eval_runtime": 23.908,
1330
+ "eval_samples_per_second": 125.481,
1331
+ "eval_steps_per_second": 3.932,
1332
+ "step": 447
1333
+ },
1334
+ {
1335
+ "epoch": 5.94,
1336
+ "eval_loss": 0.6251673698425293,
1337
+ "eval_runtime": 23.9137,
1338
+ "eval_samples_per_second": 125.451,
1339
+ "eval_steps_per_second": 3.931,
1340
+ "step": 450
1341
+ },
1342
+ {
1343
+ "epoch": 5.98,
1344
+ "eval_loss": 0.6253092288970947,
1345
+ "eval_runtime": 23.9409,
1346
+ "eval_samples_per_second": 125.309,
1347
+ "eval_steps_per_second": 3.926,
1348
+ "step": 453
1349
+ },
1350
+ {
1351
+ "epoch": 6.02,
1352
+ "eval_loss": 0.6245599389076233,
1353
+ "eval_runtime": 23.9233,
1354
+ "eval_samples_per_second": 125.401,
1355
+ "eval_steps_per_second": 3.929,
1356
+ "step": 456
1357
+ },
1358
+ {
1359
+ "epoch": 6.06,
1360
+ "eval_loss": 0.6247097849845886,
1361
+ "eval_runtime": 23.9184,
1362
+ "eval_samples_per_second": 125.426,
1363
+ "eval_steps_per_second": 3.93,
1364
+ "step": 459
1365
+ },
1366
+ {
1367
+ "epoch": 6.07,
1368
+ "learning_rate": 7.806191117092868e-05,
1369
+ "loss": 0.6075,
1370
+ "step": 460
1371
+ },
1372
+ {
1373
+ "epoch": 6.1,
1374
+ "eval_loss": 0.6237714290618896,
1375
+ "eval_runtime": 23.9137,
1376
+ "eval_samples_per_second": 125.451,
1377
+ "eval_steps_per_second": 3.931,
1378
+ "step": 462
1379
+ },
1380
+ {
1381
+ "epoch": 6.14,
1382
+ "eval_loss": 0.6239632964134216,
1383
+ "eval_runtime": 23.9142,
1384
+ "eval_samples_per_second": 125.449,
1385
+ "eval_steps_per_second": 3.931,
1386
+ "step": 465
1387
+ },
1388
+ {
1389
+ "epoch": 6.18,
1390
+ "eval_loss": 0.6246253252029419,
1391
+ "eval_runtime": 23.9199,
1392
+ "eval_samples_per_second": 125.419,
1393
+ "eval_steps_per_second": 3.93,
1394
+ "step": 468
1395
+ },
1396
+ {
1397
+ "epoch": 6.22,
1398
+ "eval_loss": 0.6236398220062256,
1399
+ "eval_runtime": 23.9169,
1400
+ "eval_samples_per_second": 125.434,
1401
+ "eval_steps_per_second": 3.93,
1402
+ "step": 471
1403
+ },
1404
+ {
1405
+ "epoch": 6.25,
1406
+ "eval_loss": 0.6242309808731079,
1407
+ "eval_runtime": 23.9091,
1408
+ "eval_samples_per_second": 125.475,
1409
+ "eval_steps_per_second": 3.932,
1410
+ "step": 474
1411
+ },
1412
+ {
1413
+ "epoch": 6.29,
1414
+ "eval_loss": 0.6236902475357056,
1415
+ "eval_runtime": 23.9117,
1416
+ "eval_samples_per_second": 125.462,
1417
+ "eval_steps_per_second": 3.931,
1418
+ "step": 477
1419
+ },
1420
+ {
1421
+ "epoch": 6.33,
1422
+ "learning_rate": 7.267833109017497e-05,
1423
+ "loss": 0.6061,
1424
+ "step": 480
1425
+ },
1426
+ {
1427
+ "epoch": 6.33,
1428
+ "eval_loss": 0.623267650604248,
1429
+ "eval_runtime": 23.9071,
1430
+ "eval_samples_per_second": 125.485,
1431
+ "eval_steps_per_second": 3.932,
1432
+ "step": 480
1433
+ },
1434
+ {
1435
+ "epoch": 6.37,
1436
+ "eval_loss": 0.6238719820976257,
1437
+ "eval_runtime": 23.9206,
1438
+ "eval_samples_per_second": 125.415,
1439
+ "eval_steps_per_second": 3.93,
1440
+ "step": 483
1441
+ },
1442
+ {
1443
+ "epoch": 6.41,
1444
+ "eval_loss": 0.6234752535820007,
1445
+ "eval_runtime": 23.914,
1446
+ "eval_samples_per_second": 125.449,
1447
+ "eval_steps_per_second": 3.931,
1448
+ "step": 486
1449
+ },
1450
+ {
1451
+ "epoch": 6.45,
1452
+ "eval_loss": 0.6228368878364563,
1453
+ "eval_runtime": 23.9087,
1454
+ "eval_samples_per_second": 125.477,
1455
+ "eval_steps_per_second": 3.932,
1456
+ "step": 489
1457
+ },
1458
+ {
1459
+ "epoch": 6.49,
1460
+ "eval_loss": 0.6226744055747986,
1461
+ "eval_runtime": 23.9118,
1462
+ "eval_samples_per_second": 125.461,
1463
+ "eval_steps_per_second": 3.931,
1464
+ "step": 492
1465
+ },
1466
+ {
1467
+ "epoch": 6.53,
1468
+ "eval_loss": 0.622622013092041,
1469
+ "eval_runtime": 23.9341,
1470
+ "eval_samples_per_second": 125.344,
1471
+ "eval_steps_per_second": 3.927,
1472
+ "step": 495
1473
+ },
1474
+ {
1475
+ "epoch": 6.57,
1476
+ "eval_loss": 0.6228298544883728,
1477
+ "eval_runtime": 23.9079,
1478
+ "eval_samples_per_second": 125.482,
1479
+ "eval_steps_per_second": 3.932,
1480
+ "step": 498
1481
+ },
1482
+ {
1483
+ "epoch": 6.6,
1484
+ "learning_rate": 6.729475100942126e-05,
1485
+ "loss": 0.6043,
1486
+ "step": 500
1487
+ },
1488
+ {
1489
+ "epoch": 6.61,
1490
+ "eval_loss": 0.6232237815856934,
1491
+ "eval_runtime": 23.8982,
1492
+ "eval_samples_per_second": 125.533,
1493
+ "eval_steps_per_second": 3.933,
1494
+ "step": 501
1495
+ },
1496
+ {
1497
+ "epoch": 6.65,
1498
+ "eval_loss": 0.6218205690383911,
1499
+ "eval_runtime": 23.9059,
1500
+ "eval_samples_per_second": 125.492,
1501
+ "eval_steps_per_second": 3.932,
1502
+ "step": 504
1503
+ },
1504
+ {
1505
+ "epoch": 6.69,
1506
+ "eval_loss": 0.621903657913208,
1507
+ "eval_runtime": 23.8991,
1508
+ "eval_samples_per_second": 125.528,
1509
+ "eval_steps_per_second": 3.933,
1510
+ "step": 507
1511
+ },
1512
+ {
1513
+ "epoch": 6.73,
1514
+ "eval_loss": 0.622235894203186,
1515
+ "eval_runtime": 23.9024,
1516
+ "eval_samples_per_second": 125.51,
1517
+ "eval_steps_per_second": 3.933,
1518
+ "step": 510
1519
+ },
1520
+ {
1521
+ "epoch": 6.77,
1522
+ "eval_loss": 0.6220830082893372,
1523
+ "eval_runtime": 23.8926,
1524
+ "eval_samples_per_second": 125.562,
1525
+ "eval_steps_per_second": 3.934,
1526
+ "step": 513
1527
+ },
1528
+ {
1529
+ "epoch": 6.81,
1530
+ "eval_loss": 0.6220167875289917,
1531
+ "eval_runtime": 23.8965,
1532
+ "eval_samples_per_second": 125.542,
1533
+ "eval_steps_per_second": 3.934,
1534
+ "step": 516
1535
+ },
1536
+ {
1537
+ "epoch": 6.85,
1538
+ "eval_loss": 0.6222782135009766,
1539
+ "eval_runtime": 23.908,
1540
+ "eval_samples_per_second": 125.481,
1541
+ "eval_steps_per_second": 3.932,
1542
+ "step": 519
1543
+ },
1544
+ {
1545
+ "epoch": 6.86,
1546
+ "learning_rate": 6.191117092866757e-05,
1547
+ "loss": 0.6008,
1548
+ "step": 520
1549
+ },
1550
+ {
1551
+ "epoch": 6.89,
1552
+ "eval_loss": 0.6216304302215576,
1553
+ "eval_runtime": 23.9036,
1554
+ "eval_samples_per_second": 125.504,
1555
+ "eval_steps_per_second": 3.932,
1556
+ "step": 522
1557
+ },
1558
+ {
1559
+ "epoch": 6.93,
1560
+ "eval_loss": 0.6217759847640991,
1561
+ "eval_runtime": 23.9088,
1562
+ "eval_samples_per_second": 125.477,
1563
+ "eval_steps_per_second": 3.932,
1564
+ "step": 525
1565
+ },
1566
+ {
1567
+ "epoch": 6.97,
1568
+ "eval_loss": 0.6214317083358765,
1569
+ "eval_runtime": 23.9177,
1570
+ "eval_samples_per_second": 125.43,
1571
+ "eval_steps_per_second": 3.93,
1572
+ "step": 528
1573
+ },
1574
+ {
1575
+ "epoch": 7.01,
1576
+ "eval_loss": 0.6213416457176208,
1577
+ "eval_runtime": 23.9138,
1578
+ "eval_samples_per_second": 125.451,
1579
+ "eval_steps_per_second": 3.931,
1580
+ "step": 531
1581
+ },
1582
+ {
1583
+ "epoch": 7.05,
1584
+ "eval_loss": 0.6217712163925171,
1585
+ "eval_runtime": 23.9141,
1586
+ "eval_samples_per_second": 125.449,
1587
+ "eval_steps_per_second": 3.931,
1588
+ "step": 534
1589
+ },
1590
+ {
1591
+ "epoch": 7.09,
1592
+ "eval_loss": 0.6215860843658447,
1593
+ "eval_runtime": 23.9145,
1594
+ "eval_samples_per_second": 125.447,
1595
+ "eval_steps_per_second": 3.931,
1596
+ "step": 537
1597
+ },
1598
+ {
1599
+ "epoch": 7.13,
1600
+ "learning_rate": 5.652759084791387e-05,
1601
+ "loss": 0.599,
1602
+ "step": 540
1603
+ },
1604
+ {
1605
+ "epoch": 7.13,
1606
+ "eval_loss": 0.6211041808128357,
1607
+ "eval_runtime": 23.9125,
1608
+ "eval_samples_per_second": 125.457,
1609
+ "eval_steps_per_second": 3.931,
1610
+ "step": 540
1611
+ },
1612
+ {
1613
+ "epoch": 7.17,
1614
+ "eval_loss": 0.6210355758666992,
1615
+ "eval_runtime": 23.911,
1616
+ "eval_samples_per_second": 125.465,
1617
+ "eval_steps_per_second": 3.931,
1618
+ "step": 543
1619
+ },
1620
+ {
1621
+ "epoch": 7.2,
1622
+ "eval_loss": 0.6209889650344849,
1623
+ "eval_runtime": 23.9062,
1624
+ "eval_samples_per_second": 125.491,
1625
+ "eval_steps_per_second": 3.932,
1626
+ "step": 546
1627
+ },
1628
+ {
1629
+ "epoch": 7.24,
1630
+ "eval_loss": 0.6205114126205444,
1631
+ "eval_runtime": 23.9227,
1632
+ "eval_samples_per_second": 125.404,
1633
+ "eval_steps_per_second": 3.929,
1634
+ "step": 549
1635
+ },
1636
+ {
1637
+ "epoch": 7.28,
1638
+ "eval_loss": 0.6204013824462891,
1639
+ "eval_runtime": 23.9146,
1640
+ "eval_samples_per_second": 125.446,
1641
+ "eval_steps_per_second": 3.931,
1642
+ "step": 552
1643
+ },
1644
+ {
1645
+ "epoch": 7.32,
1646
+ "eval_loss": 0.6202988028526306,
1647
+ "eval_runtime": 23.9015,
1648
+ "eval_samples_per_second": 125.515,
1649
+ "eval_steps_per_second": 3.933,
1650
+ "step": 555
1651
+ },
1652
+ {
1653
+ "epoch": 7.36,
1654
+ "eval_loss": 0.6199727654457092,
1655
+ "eval_runtime": 23.9089,
1656
+ "eval_samples_per_second": 125.476,
1657
+ "eval_steps_per_second": 3.932,
1658
+ "step": 558
1659
+ },
1660
+ {
1661
+ "epoch": 7.39,
1662
+ "learning_rate": 5.1144010767160164e-05,
1663
+ "loss": 0.5959,
1664
+ "step": 560
1665
+ },
1666
+ {
1667
+ "epoch": 7.4,
1668
+ "eval_loss": 0.619968831539154,
1669
+ "eval_runtime": 23.9043,
1670
+ "eval_samples_per_second": 125.5,
1671
+ "eval_steps_per_second": 3.932,
1672
+ "step": 561
1673
+ },
1674
+ {
1675
+ "epoch": 7.44,
1676
+ "eval_loss": 0.6202374696731567,
1677
+ "eval_runtime": 23.9117,
1678
+ "eval_samples_per_second": 125.461,
1679
+ "eval_steps_per_second": 3.931,
1680
+ "step": 564
1681
+ },
1682
+ {
1683
+ "epoch": 7.48,
1684
+ "eval_loss": 0.6202066540718079,
1685
+ "eval_runtime": 23.908,
1686
+ "eval_samples_per_second": 125.481,
1687
+ "eval_steps_per_second": 3.932,
1688
+ "step": 567
1689
+ },
1690
+ {
1691
+ "epoch": 7.52,
1692
+ "eval_loss": 0.6198835968971252,
1693
+ "eval_runtime": 23.9244,
1694
+ "eval_samples_per_second": 125.395,
1695
+ "eval_steps_per_second": 3.929,
1696
+ "step": 570
1697
+ },
1698
+ {
1699
+ "epoch": 7.56,
1700
+ "eval_loss": 0.6199198961257935,
1701
+ "eval_runtime": 23.9263,
1702
+ "eval_samples_per_second": 125.385,
1703
+ "eval_steps_per_second": 3.929,
1704
+ "step": 573
1705
+ },
1706
+ {
1707
+ "epoch": 7.6,
1708
+ "eval_loss": 0.6195517182350159,
1709
+ "eval_runtime": 23.9125,
1710
+ "eval_samples_per_second": 125.457,
1711
+ "eval_steps_per_second": 3.931,
1712
+ "step": 576
1713
+ },
1714
+ {
1715
+ "epoch": 7.64,
1716
+ "eval_loss": 0.6192638278007507,
1717
+ "eval_runtime": 23.9168,
1718
+ "eval_samples_per_second": 125.435,
1719
+ "eval_steps_per_second": 3.93,
1720
+ "step": 579
1721
+ },
1722
+ {
1723
+ "epoch": 7.65,
1724
+ "learning_rate": 4.576043068640646e-05,
1725
+ "loss": 0.5922,
1726
+ "step": 580
1727
+ },
1728
+ {
1729
+ "epoch": 7.68,
1730
+ "eval_loss": 0.6196587085723877,
1731
+ "eval_runtime": 23.9305,
1732
+ "eval_samples_per_second": 125.363,
1733
+ "eval_steps_per_second": 3.928,
1734
+ "step": 582
1735
+ },
1736
+ {
1737
+ "epoch": 7.72,
1738
+ "eval_loss": 0.6198856830596924,
1739
+ "eval_runtime": 23.919,
1740
+ "eval_samples_per_second": 125.423,
1741
+ "eval_steps_per_second": 3.93,
1742
+ "step": 585
1743
+ },
1744
+ {
1745
+ "epoch": 7.76,
1746
+ "eval_loss": 0.6196783781051636,
1747
+ "eval_runtime": 23.9042,
1748
+ "eval_samples_per_second": 125.501,
1749
+ "eval_steps_per_second": 3.932,
1750
+ "step": 588
1751
+ },
1752
+ {
1753
+ "epoch": 7.8,
1754
+ "eval_loss": 0.6192678809165955,
1755
+ "eval_runtime": 23.9057,
1756
+ "eval_samples_per_second": 125.493,
1757
+ "eval_steps_per_second": 3.932,
1758
+ "step": 591
1759
+ },
1760
+ {
1761
+ "epoch": 7.84,
1762
+ "eval_loss": 0.6192264556884766,
1763
+ "eval_runtime": 23.909,
1764
+ "eval_samples_per_second": 125.476,
1765
+ "eval_steps_per_second": 3.932,
1766
+ "step": 594
1767
+ },
1768
+ {
1769
+ "epoch": 7.88,
1770
+ "eval_loss": 0.6192458271980286,
1771
+ "eval_runtime": 23.8917,
1772
+ "eval_samples_per_second": 125.567,
1773
+ "eval_steps_per_second": 3.934,
1774
+ "step": 597
1775
+ },
1776
+ {
1777
+ "epoch": 7.92,
1778
+ "learning_rate": 4.037685060565276e-05,
1779
+ "loss": 0.6028,
1780
+ "step": 600
1781
+ },
1782
+ {
1783
+ "epoch": 7.92,
1784
+ "eval_loss": 0.6192883849143982,
1785
+ "eval_runtime": 23.9005,
1786
+ "eval_samples_per_second": 125.521,
1787
+ "eval_steps_per_second": 3.933,
1788
+ "step": 600
1789
+ },
1790
+ {
1791
+ "epoch": 7.96,
1792
+ "eval_loss": 0.6194872856140137,
1793
+ "eval_runtime": 23.9118,
1794
+ "eval_samples_per_second": 125.461,
1795
+ "eval_steps_per_second": 3.931,
1796
+ "step": 603
1797
+ },
1798
+ {
1799
+ "epoch": 8.0,
1800
+ "eval_loss": 0.619368314743042,
1801
+ "eval_runtime": 23.9121,
1802
+ "eval_samples_per_second": 125.46,
1803
+ "eval_steps_per_second": 3.931,
1804
+ "step": 606
1805
+ },
1806
+ {
1807
+ "epoch": 8.04,
1808
+ "eval_loss": 0.6190740466117859,
1809
+ "eval_runtime": 23.9192,
1810
+ "eval_samples_per_second": 125.422,
1811
+ "eval_steps_per_second": 3.93,
1812
+ "step": 609
1813
+ },
1814
+ {
1815
+ "epoch": 8.08,
1816
+ "eval_loss": 0.6190269589424133,
1817
+ "eval_runtime": 23.916,
1818
+ "eval_samples_per_second": 125.439,
1819
+ "eval_steps_per_second": 3.93,
1820
+ "step": 612
1821
+ },
1822
+ {
1823
+ "epoch": 8.12,
1824
+ "eval_loss": 0.6189839839935303,
1825
+ "eval_runtime": 23.9063,
1826
+ "eval_samples_per_second": 125.49,
1827
+ "eval_steps_per_second": 3.932,
1828
+ "step": 615
1829
+ },
1830
+ {
1831
+ "epoch": 8.16,
1832
+ "eval_loss": 0.618523895740509,
1833
+ "eval_runtime": 23.8928,
1834
+ "eval_samples_per_second": 125.561,
1835
+ "eval_steps_per_second": 3.934,
1836
+ "step": 618
1837
+ },
1838
+ {
1839
+ "epoch": 8.18,
1840
+ "learning_rate": 3.499327052489906e-05,
1841
+ "loss": 0.5941,
1842
+ "step": 620
1843
+ },
1844
+ {
1845
+ "epoch": 8.19,
1846
+ "eval_loss": 0.6187476515769958,
1847
+ "eval_runtime": 23.9016,
1848
+ "eval_samples_per_second": 125.515,
1849
+ "eval_steps_per_second": 3.933,
1850
+ "step": 621
1851
+ },
1852
+ {
1853
+ "epoch": 8.23,
1854
+ "eval_loss": 0.6186578869819641,
1855
+ "eval_runtime": 23.9003,
1856
+ "eval_samples_per_second": 125.521,
1857
+ "eval_steps_per_second": 3.933,
1858
+ "step": 624
1859
+ },
1860
+ {
1861
+ "epoch": 8.27,
1862
+ "eval_loss": 0.6190162897109985,
1863
+ "eval_runtime": 23.9155,
1864
+ "eval_samples_per_second": 125.442,
1865
+ "eval_steps_per_second": 3.931,
1866
+ "step": 627
1867
+ },
1868
+ {
1869
+ "epoch": 8.31,
1870
+ "eval_loss": 0.6189883351325989,
1871
+ "eval_runtime": 23.9091,
1872
+ "eval_samples_per_second": 125.475,
1873
+ "eval_steps_per_second": 3.932,
1874
+ "step": 630
1875
+ },
1876
+ {
1877
+ "epoch": 8.35,
1878
+ "eval_loss": 0.6184096932411194,
1879
+ "eval_runtime": 23.9042,
1880
+ "eval_samples_per_second": 125.501,
1881
+ "eval_steps_per_second": 3.932,
1882
+ "step": 633
1883
+ },
1884
+ {
1885
+ "epoch": 8.39,
1886
+ "eval_loss": 0.6180031895637512,
1887
+ "eval_runtime": 23.9189,
1888
+ "eval_samples_per_second": 125.424,
1889
+ "eval_steps_per_second": 3.93,
1890
+ "step": 636
1891
+ },
1892
+ {
1893
+ "epoch": 8.43,
1894
+ "eval_loss": 0.6179867386817932,
1895
+ "eval_runtime": 23.922,
1896
+ "eval_samples_per_second": 125.407,
1897
+ "eval_steps_per_second": 3.929,
1898
+ "step": 639
1899
+ },
1900
+ {
1901
+ "epoch": 8.45,
1902
+ "learning_rate": 2.960969044414536e-05,
1903
+ "loss": 0.5906,
1904
+ "step": 640
1905
+ },
1906
+ {
1907
+ "epoch": 8.47,
1908
+ "eval_loss": 0.6182823777198792,
1909
+ "eval_runtime": 23.9158,
1910
+ "eval_samples_per_second": 125.44,
1911
+ "eval_steps_per_second": 3.93,
1912
+ "step": 642
1913
+ },
1914
+ {
1915
+ "epoch": 8.51,
1916
+ "eval_loss": 0.6179353594779968,
1917
+ "eval_runtime": 23.914,
1918
+ "eval_samples_per_second": 125.45,
1919
+ "eval_steps_per_second": 3.931,
1920
+ "step": 645
1921
+ },
1922
+ {
1923
+ "epoch": 8.55,
1924
+ "eval_loss": 0.6178385615348816,
1925
+ "eval_runtime": 23.9079,
1926
+ "eval_samples_per_second": 125.482,
1927
+ "eval_steps_per_second": 3.932,
1928
+ "step": 648
1929
+ },
1930
+ {
1931
+ "epoch": 8.59,
1932
+ "eval_loss": 0.6179868578910828,
1933
+ "eval_runtime": 23.9189,
1934
+ "eval_samples_per_second": 125.424,
1935
+ "eval_steps_per_second": 3.93,
1936
+ "step": 651
1937
+ },
1938
+ {
1939
+ "epoch": 8.63,
1940
+ "eval_loss": 0.6179595589637756,
1941
+ "eval_runtime": 23.9327,
1942
+ "eval_samples_per_second": 125.352,
1943
+ "eval_steps_per_second": 3.928,
1944
+ "step": 654
1945
+ },
1946
+ {
1947
+ "epoch": 8.67,
1948
+ "eval_loss": 0.6177854537963867,
1949
+ "eval_runtime": 23.9123,
1950
+ "eval_samples_per_second": 125.458,
1951
+ "eval_steps_per_second": 3.931,
1952
+ "step": 657
1953
+ },
1954
+ {
1955
+ "epoch": 8.71,
1956
+ "learning_rate": 2.422611036339166e-05,
1957
+ "loss": 0.5908,
1958
+ "step": 660
1959
+ },
1960
+ {
1961
+ "epoch": 8.71,
1962
+ "eval_loss": 0.6179735064506531,
1963
+ "eval_runtime": 23.9135,
1964
+ "eval_samples_per_second": 125.452,
1965
+ "eval_steps_per_second": 3.931,
1966
+ "step": 660
1967
+ },
1968
+ {
1969
+ "epoch": 8.75,
1970
+ "eval_loss": 0.6180996298789978,
1971
+ "eval_runtime": 23.9038,
1972
+ "eval_samples_per_second": 125.503,
1973
+ "eval_steps_per_second": 3.932,
1974
+ "step": 663
1975
+ },
1976
+ {
1977
+ "epoch": 8.79,
1978
+ "eval_loss": 0.6181532740592957,
1979
+ "eval_runtime": 23.9058,
1980
+ "eval_samples_per_second": 125.492,
1981
+ "eval_steps_per_second": 3.932,
1982
+ "step": 666
1983
+ },
1984
+ {
1985
+ "epoch": 8.83,
1986
+ "eval_loss": 0.6176265478134155,
1987
+ "eval_runtime": 23.9128,
1988
+ "eval_samples_per_second": 125.456,
1989
+ "eval_steps_per_second": 3.931,
1990
+ "step": 669
1991
+ },
1992
+ {
1993
+ "epoch": 8.87,
1994
+ "eval_loss": 0.617388904094696,
1995
+ "eval_runtime": 23.8906,
1996
+ "eval_samples_per_second": 125.572,
1997
+ "eval_steps_per_second": 3.935,
1998
+ "step": 672
1999
+ },
2000
+ {
2001
+ "epoch": 8.91,
2002
+ "eval_loss": 0.6174699068069458,
2003
+ "eval_runtime": 23.9051,
2004
+ "eval_samples_per_second": 125.496,
2005
+ "eval_steps_per_second": 3.932,
2006
+ "step": 675
2007
+ },
2008
+ {
2009
+ "epoch": 8.95,
2010
+ "eval_loss": 0.6174932718276978,
2011
+ "eval_runtime": 23.8969,
2012
+ "eval_samples_per_second": 125.54,
2013
+ "eval_steps_per_second": 3.934,
2014
+ "step": 678
2015
+ },
2016
+ {
2017
+ "epoch": 8.97,
2018
+ "learning_rate": 1.8842530282637954e-05,
2019
+ "loss": 0.5937,
2020
+ "step": 680
2021
+ },
2022
+ {
2023
+ "epoch": 8.99,
2024
+ "eval_loss": 0.6174784898757935,
2025
+ "eval_runtime": 23.9056,
2026
+ "eval_samples_per_second": 125.494,
2027
+ "eval_steps_per_second": 3.932,
2028
+ "step": 681
2029
+ },
2030
+ {
2031
+ "epoch": 9.03,
2032
+ "eval_loss": 0.6175104975700378,
2033
+ "eval_runtime": 23.9136,
2034
+ "eval_samples_per_second": 125.452,
2035
+ "eval_steps_per_second": 3.931,
2036
+ "step": 684
2037
+ },
2038
+ {
2039
+ "epoch": 9.07,
2040
+ "eval_loss": 0.6173563599586487,
2041
+ "eval_runtime": 23.9237,
2042
+ "eval_samples_per_second": 125.399,
2043
+ "eval_steps_per_second": 3.929,
2044
+ "step": 687
2045
+ },
2046
+ {
2047
+ "epoch": 9.11,
2048
+ "eval_loss": 0.6172643899917603,
2049
+ "eval_runtime": 23.9021,
2050
+ "eval_samples_per_second": 125.512,
2051
+ "eval_steps_per_second": 3.933,
2052
+ "step": 690
2053
+ },
2054
+ {
2055
+ "epoch": 9.14,
2056
+ "eval_loss": 0.6172318458557129,
2057
+ "eval_runtime": 23.9242,
2058
+ "eval_samples_per_second": 125.396,
2059
+ "eval_steps_per_second": 3.929,
2060
+ "step": 693
2061
+ },
2062
+ {
2063
+ "epoch": 9.18,
2064
+ "eval_loss": 0.617030680179596,
2065
+ "eval_runtime": 23.9184,
2066
+ "eval_samples_per_second": 125.427,
2067
+ "eval_steps_per_second": 3.93,
2068
+ "step": 696
2069
+ },
2070
+ {
2071
+ "epoch": 9.22,
2072
+ "eval_loss": 0.6169764995574951,
2073
+ "eval_runtime": 23.9104,
2074
+ "eval_samples_per_second": 125.468,
2075
+ "eval_steps_per_second": 3.931,
2076
+ "step": 699
2077
+ },
2078
+ {
2079
+ "epoch": 9.24,
2080
+ "learning_rate": 1.3458950201884254e-05,
2081
+ "loss": 0.5867,
2082
+ "step": 700
2083
+ },
2084
+ {
2085
+ "epoch": 9.26,
2086
+ "eval_loss": 0.6171083450317383,
2087
+ "eval_runtime": 23.9126,
2088
+ "eval_samples_per_second": 125.457,
2089
+ "eval_steps_per_second": 3.931,
2090
+ "step": 702
2091
+ },
2092
+ {
2093
+ "epoch": 9.3,
2094
+ "eval_loss": 0.6171473860740662,
2095
+ "eval_runtime": 23.9015,
2096
+ "eval_samples_per_second": 125.515,
2097
+ "eval_steps_per_second": 3.933,
2098
+ "step": 705
2099
+ },
2100
+ {
2101
+ "epoch": 9.34,
2102
+ "eval_loss": 0.6170982122421265,
2103
+ "eval_runtime": 23.9023,
2104
+ "eval_samples_per_second": 125.511,
2105
+ "eval_steps_per_second": 3.933,
2106
+ "step": 708
2107
+ },
2108
+ {
2109
+ "epoch": 9.38,
2110
+ "eval_loss": 0.6169420480728149,
2111
+ "eval_runtime": 23.9074,
2112
+ "eval_samples_per_second": 125.484,
2113
+ "eval_steps_per_second": 3.932,
2114
+ "step": 711
2115
+ },
2116
+ {
2117
+ "epoch": 9.42,
2118
+ "eval_loss": 0.6168730854988098,
2119
+ "eval_runtime": 23.9047,
2120
+ "eval_samples_per_second": 125.498,
2121
+ "eval_steps_per_second": 3.932,
2122
+ "step": 714
2123
+ },
2124
+ {
2125
+ "epoch": 9.46,
2126
+ "eval_loss": 0.6168663501739502,
2127
+ "eval_runtime": 23.9161,
2128
+ "eval_samples_per_second": 125.439,
2129
+ "eval_steps_per_second": 3.93,
2130
+ "step": 717
2131
+ },
2132
+ {
2133
+ "epoch": 9.5,
2134
+ "learning_rate": 8.075370121130552e-06,
2135
+ "loss": 0.5891,
2136
+ "step": 720
2137
+ },
2138
+ {
2139
+ "epoch": 9.5,
2140
+ "eval_loss": 0.6167708039283752,
2141
+ "eval_runtime": 23.9114,
2142
+ "eval_samples_per_second": 125.463,
2143
+ "eval_steps_per_second": 3.931,
2144
+ "step": 720
2145
+ },
2146
+ {
2147
+ "epoch": 9.54,
2148
+ "eval_loss": 0.6167441010475159,
2149
+ "eval_runtime": 23.9086,
2150
+ "eval_samples_per_second": 125.478,
2151
+ "eval_steps_per_second": 3.932,
2152
+ "step": 723
2153
+ },
2154
+ {
2155
+ "epoch": 9.58,
2156
+ "eval_loss": 0.6168937683105469,
2157
+ "eval_runtime": 23.9122,
2158
+ "eval_samples_per_second": 125.459,
2159
+ "eval_steps_per_second": 3.931,
2160
+ "step": 726
2161
+ },
2162
+ {
2163
+ "epoch": 9.62,
2164
+ "eval_loss": 0.6170821189880371,
2165
+ "eval_runtime": 23.9065,
2166
+ "eval_samples_per_second": 125.489,
2167
+ "eval_steps_per_second": 3.932,
2168
+ "step": 729
2169
+ },
2170
+ {
2171
+ "epoch": 9.66,
2172
+ "eval_loss": 0.6171652674674988,
2173
+ "eval_runtime": 23.9131,
2174
+ "eval_samples_per_second": 125.454,
2175
+ "eval_steps_per_second": 3.931,
2176
+ "step": 732
2177
+ },
2178
+ {
2179
+ "epoch": 9.7,
2180
+ "eval_loss": 0.6171714663505554,
2181
+ "eval_runtime": 23.9057,
2182
+ "eval_samples_per_second": 125.493,
2183
+ "eval_steps_per_second": 3.932,
2184
+ "step": 735
2185
+ },
2186
+ {
2187
+ "epoch": 9.74,
2188
+ "eval_loss": 0.6170787811279297,
2189
+ "eval_runtime": 23.928,
2190
+ "eval_samples_per_second": 125.376,
2191
+ "eval_steps_per_second": 3.928,
2192
+ "step": 738
2193
+ }
2194
+ ],
2195
+ "max_steps": 750,
2196
+ "num_train_epochs": 10,
2197
+ "total_flos": 7.798891163997438e+18,
2198
+ "trial_name": null,
2199
+ "trial_params": null
2200
+ }
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-738/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d5a386ce8addef927bca0e390e9534e0877a3e4e00f222f83967dd78c49527a
3
+ size 4027
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/adapter_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "/mnt/data1/sheshuaijie/Data/PLM/vicuna-7b",
3
+ "bias": "none",
4
+ "fan_in_fan_out": false,
5
+ "inference_mode": true,
6
+ "init_lora_weights": true,
7
+ "lora_alpha": 32,
8
+ "lora_dropout": 0.1,
9
+ "modules_to_save": null,
10
+ "peft_type": "LORA",
11
+ "r": 32,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "task_type": "CAUSAL_LM"
17
+ }
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406
3
+ size 443
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cb805ac62e415da1c15d91797d4355c8080bdcfc7c7781a4ed1539877a0af57
3
+ size 134293701
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0957395ee23cae5537ca68fdb8f5f3c0ba7ef97e714bd742f388719a9c5cd4c5
3
+ size 67154893
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e292c6ba42d09ed37e3998fc80693007d182b35ac775e2161f534d0f9f083131
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7c10e4824a89fe497107b1825bf1cfa4fbb78538f2193d56151fad9e2ee99bc
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba511621fdf523ff7989a294ae5130ea54f3b14afd891da2e41bf244e8a1cfc5
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c91d9cec94c7ef784b2b57a2a410bb4b17e31551753c577ee7e48e81791c2057
3
+ size 17655
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f40d0f980607497195b40bb30daf589ddd80f47c884eb1a2231e665f009933b
3
+ size 557
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1b48f5991e61a145c837cbe5f50a0347f208c7ef9ad1718e593fddf77a205b0
3
+ size 627
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "clean_up_tokenization_spaces": false,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "model_max_length": 1000000000000000019884624838656,
22
+ "pad_token": null,
23
+ "sp_model_kwargs": {},
24
+ "tokenizer_class": "LlamaTokenizer",
25
+ "unk_token": {
26
+ "__type": "AddedToken",
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": true,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/trainer_state.json ADDED
@@ -0,0 +1,2214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6167441010475159,
3
+ "best_model_checkpoint": "/mnt/data1/sheshuaijie/Output/CoT/Trained/vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-723",
4
+ "epoch": 9.778144329896907,
5
+ "global_step": 741,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.04,
12
+ "eval_loss": 1.7524008750915527,
13
+ "eval_runtime": 23.7951,
14
+ "eval_samples_per_second": 126.077,
15
+ "eval_steps_per_second": 3.95,
16
+ "step": 3
17
+ },
18
+ {
19
+ "epoch": 0.08,
20
+ "eval_loss": 1.5515066385269165,
21
+ "eval_runtime": 23.828,
22
+ "eval_samples_per_second": 125.902,
23
+ "eval_steps_per_second": 3.945,
24
+ "step": 6
25
+ },
26
+ {
27
+ "epoch": 0.12,
28
+ "eval_loss": 1.3584641218185425,
29
+ "eval_runtime": 23.8775,
30
+ "eval_samples_per_second": 125.641,
31
+ "eval_steps_per_second": 3.937,
32
+ "step": 9
33
+ },
34
+ {
35
+ "epoch": 0.16,
36
+ "eval_loss": 1.2644726037979126,
37
+ "eval_runtime": 23.8942,
38
+ "eval_samples_per_second": 125.554,
39
+ "eval_steps_per_second": 3.934,
40
+ "step": 12
41
+ },
42
+ {
43
+ "epoch": 0.2,
44
+ "eval_loss": 1.166400671005249,
45
+ "eval_runtime": 23.9181,
46
+ "eval_samples_per_second": 125.428,
47
+ "eval_steps_per_second": 3.93,
48
+ "step": 15
49
+ },
50
+ {
51
+ "epoch": 0.24,
52
+ "eval_loss": 1.1086052656173706,
53
+ "eval_runtime": 23.9166,
54
+ "eval_samples_per_second": 125.436,
55
+ "eval_steps_per_second": 3.93,
56
+ "step": 18
57
+ },
58
+ {
59
+ "epoch": 0.26,
60
+ "learning_rate": 0.00019650067294751011,
61
+ "loss": 1.4265,
62
+ "step": 20
63
+ },
64
+ {
65
+ "epoch": 0.28,
66
+ "eval_loss": 1.0677987337112427,
67
+ "eval_runtime": 23.9189,
68
+ "eval_samples_per_second": 125.424,
69
+ "eval_steps_per_second": 3.93,
70
+ "step": 21
71
+ },
72
+ {
73
+ "epoch": 0.32,
74
+ "eval_loss": 1.0342437028884888,
75
+ "eval_runtime": 23.9046,
76
+ "eval_samples_per_second": 125.499,
77
+ "eval_steps_per_second": 3.932,
78
+ "step": 24
79
+ },
80
+ {
81
+ "epoch": 0.36,
82
+ "eval_loss": 0.9985266923904419,
83
+ "eval_runtime": 23.9037,
84
+ "eval_samples_per_second": 125.504,
85
+ "eval_steps_per_second": 3.932,
86
+ "step": 27
87
+ },
88
+ {
89
+ "epoch": 0.4,
90
+ "eval_loss": 0.9654523134231567,
91
+ "eval_runtime": 23.9129,
92
+ "eval_samples_per_second": 125.455,
93
+ "eval_steps_per_second": 3.931,
94
+ "step": 30
95
+ },
96
+ {
97
+ "epoch": 0.44,
98
+ "eval_loss": 0.939262866973877,
99
+ "eval_runtime": 23.9117,
100
+ "eval_samples_per_second": 125.462,
101
+ "eval_steps_per_second": 3.931,
102
+ "step": 33
103
+ },
104
+ {
105
+ "epoch": 0.48,
106
+ "eval_loss": 0.9186767339706421,
107
+ "eval_runtime": 23.9011,
108
+ "eval_samples_per_second": 125.517,
109
+ "eval_steps_per_second": 3.933,
110
+ "step": 36
111
+ },
112
+ {
113
+ "epoch": 0.51,
114
+ "eval_loss": 0.8969741463661194,
115
+ "eval_runtime": 23.9105,
116
+ "eval_samples_per_second": 125.468,
117
+ "eval_steps_per_second": 3.931,
118
+ "step": 39
119
+ },
120
+ {
121
+ "epoch": 0.53,
122
+ "learning_rate": 0.00019111709286675642,
123
+ "loss": 0.9923,
124
+ "step": 40
125
+ },
126
+ {
127
+ "epoch": 0.55,
128
+ "eval_loss": 0.8814375996589661,
129
+ "eval_runtime": 23.9154,
130
+ "eval_samples_per_second": 125.442,
131
+ "eval_steps_per_second": 3.931,
132
+ "step": 42
133
+ },
134
+ {
135
+ "epoch": 0.59,
136
+ "eval_loss": 0.8654683232307434,
137
+ "eval_runtime": 23.9108,
138
+ "eval_samples_per_second": 125.466,
139
+ "eval_steps_per_second": 3.931,
140
+ "step": 45
141
+ },
142
+ {
143
+ "epoch": 0.63,
144
+ "eval_loss": 0.852226734161377,
145
+ "eval_runtime": 23.9186,
146
+ "eval_samples_per_second": 125.425,
147
+ "eval_steps_per_second": 3.93,
148
+ "step": 48
149
+ },
150
+ {
151
+ "epoch": 0.67,
152
+ "eval_loss": 0.839223325252533,
153
+ "eval_runtime": 23.9074,
154
+ "eval_samples_per_second": 125.484,
155
+ "eval_steps_per_second": 3.932,
156
+ "step": 51
157
+ },
158
+ {
159
+ "epoch": 0.71,
160
+ "eval_loss": 0.8266379237174988,
161
+ "eval_runtime": 23.9399,
162
+ "eval_samples_per_second": 125.314,
163
+ "eval_steps_per_second": 3.926,
164
+ "step": 54
165
+ },
166
+ {
167
+ "epoch": 0.75,
168
+ "eval_loss": 0.8140417337417603,
169
+ "eval_runtime": 23.9355,
170
+ "eval_samples_per_second": 125.337,
171
+ "eval_steps_per_second": 3.927,
172
+ "step": 57
173
+ },
174
+ {
175
+ "epoch": 0.79,
176
+ "learning_rate": 0.0001857335127860027,
177
+ "loss": 0.8611,
178
+ "step": 60
179
+ },
180
+ {
181
+ "epoch": 0.79,
182
+ "eval_loss": 0.8019057512283325,
183
+ "eval_runtime": 23.9223,
184
+ "eval_samples_per_second": 125.406,
185
+ "eval_steps_per_second": 3.929,
186
+ "step": 60
187
+ },
188
+ {
189
+ "epoch": 0.83,
190
+ "eval_loss": 0.7907609343528748,
191
+ "eval_runtime": 23.9384,
192
+ "eval_samples_per_second": 125.322,
193
+ "eval_steps_per_second": 3.927,
194
+ "step": 63
195
+ },
196
+ {
197
+ "epoch": 0.87,
198
+ "eval_loss": 0.7791212797164917,
199
+ "eval_runtime": 23.9101,
200
+ "eval_samples_per_second": 125.47,
201
+ "eval_steps_per_second": 3.931,
202
+ "step": 66
203
+ },
204
+ {
205
+ "epoch": 0.91,
206
+ "eval_loss": 0.7694615125656128,
207
+ "eval_runtime": 23.9079,
208
+ "eval_samples_per_second": 125.481,
209
+ "eval_steps_per_second": 3.932,
210
+ "step": 69
211
+ },
212
+ {
213
+ "epoch": 0.95,
214
+ "eval_loss": 0.7602358460426331,
215
+ "eval_runtime": 23.9116,
216
+ "eval_samples_per_second": 125.462,
217
+ "eval_steps_per_second": 3.931,
218
+ "step": 72
219
+ },
220
+ {
221
+ "epoch": 0.99,
222
+ "eval_loss": 0.753226101398468,
223
+ "eval_runtime": 23.9242,
224
+ "eval_samples_per_second": 125.396,
225
+ "eval_steps_per_second": 3.929,
226
+ "step": 75
227
+ },
228
+ {
229
+ "epoch": 1.03,
230
+ "eval_loss": 0.7466432452201843,
231
+ "eval_runtime": 23.9116,
232
+ "eval_samples_per_second": 125.462,
233
+ "eval_steps_per_second": 3.931,
234
+ "step": 78
235
+ },
236
+ {
237
+ "epoch": 1.06,
238
+ "learning_rate": 0.000180349932705249,
239
+ "loss": 0.7843,
240
+ "step": 80
241
+ },
242
+ {
243
+ "epoch": 1.07,
244
+ "eval_loss": 0.7416810989379883,
245
+ "eval_runtime": 23.9171,
246
+ "eval_samples_per_second": 125.433,
247
+ "eval_steps_per_second": 3.93,
248
+ "step": 81
249
+ },
250
+ {
251
+ "epoch": 1.11,
252
+ "eval_loss": 0.7362396121025085,
253
+ "eval_runtime": 23.9079,
254
+ "eval_samples_per_second": 125.481,
255
+ "eval_steps_per_second": 3.932,
256
+ "step": 84
257
+ },
258
+ {
259
+ "epoch": 1.15,
260
+ "eval_loss": 0.7297741174697876,
261
+ "eval_runtime": 23.9084,
262
+ "eval_samples_per_second": 125.479,
263
+ "eval_steps_per_second": 3.932,
264
+ "step": 87
265
+ },
266
+ {
267
+ "epoch": 1.19,
268
+ "eval_loss": 0.7252654433250427,
269
+ "eval_runtime": 23.9206,
270
+ "eval_samples_per_second": 125.415,
271
+ "eval_steps_per_second": 3.93,
272
+ "step": 90
273
+ },
274
+ {
275
+ "epoch": 1.23,
276
+ "eval_loss": 0.7213409543037415,
277
+ "eval_runtime": 23.9179,
278
+ "eval_samples_per_second": 125.429,
279
+ "eval_steps_per_second": 3.93,
280
+ "step": 93
281
+ },
282
+ {
283
+ "epoch": 1.27,
284
+ "eval_loss": 0.7174035906791687,
285
+ "eval_runtime": 23.9354,
286
+ "eval_samples_per_second": 125.337,
287
+ "eval_steps_per_second": 3.927,
288
+ "step": 96
289
+ },
290
+ {
291
+ "epoch": 1.31,
292
+ "eval_loss": 0.7140380144119263,
293
+ "eval_runtime": 23.9214,
294
+ "eval_samples_per_second": 125.411,
295
+ "eval_steps_per_second": 3.93,
296
+ "step": 99
297
+ },
298
+ {
299
+ "epoch": 1.32,
300
+ "learning_rate": 0.0001749663526244953,
301
+ "loss": 0.7301,
302
+ "step": 100
303
+ },
304
+ {
305
+ "epoch": 1.35,
306
+ "eval_loss": 0.7104487419128418,
307
+ "eval_runtime": 23.9093,
308
+ "eval_samples_per_second": 125.474,
309
+ "eval_steps_per_second": 3.932,
310
+ "step": 102
311
+ },
312
+ {
313
+ "epoch": 1.39,
314
+ "eval_loss": 0.7067868113517761,
315
+ "eval_runtime": 23.9129,
316
+ "eval_samples_per_second": 125.455,
317
+ "eval_steps_per_second": 3.931,
318
+ "step": 105
319
+ },
320
+ {
321
+ "epoch": 1.43,
322
+ "eval_loss": 0.7041762471199036,
323
+ "eval_runtime": 23.9161,
324
+ "eval_samples_per_second": 125.439,
325
+ "eval_steps_per_second": 3.93,
326
+ "step": 108
327
+ },
328
+ {
329
+ "epoch": 1.46,
330
+ "eval_loss": 0.7013522982597351,
331
+ "eval_runtime": 23.9133,
332
+ "eval_samples_per_second": 125.453,
333
+ "eval_steps_per_second": 3.931,
334
+ "step": 111
335
+ },
336
+ {
337
+ "epoch": 1.5,
338
+ "eval_loss": 0.6989504098892212,
339
+ "eval_runtime": 23.9152,
340
+ "eval_samples_per_second": 125.443,
341
+ "eval_steps_per_second": 3.931,
342
+ "step": 114
343
+ },
344
+ {
345
+ "epoch": 1.54,
346
+ "eval_loss": 0.6974085569381714,
347
+ "eval_runtime": 23.9561,
348
+ "eval_samples_per_second": 125.229,
349
+ "eval_steps_per_second": 3.924,
350
+ "step": 117
351
+ },
352
+ {
353
+ "epoch": 1.58,
354
+ "learning_rate": 0.0001695827725437416,
355
+ "loss": 0.7141,
356
+ "step": 120
357
+ },
358
+ {
359
+ "epoch": 1.58,
360
+ "eval_loss": 0.6944894194602966,
361
+ "eval_runtime": 23.902,
362
+ "eval_samples_per_second": 125.512,
363
+ "eval_steps_per_second": 3.933,
364
+ "step": 120
365
+ },
366
+ {
367
+ "epoch": 1.62,
368
+ "eval_loss": 0.6929482221603394,
369
+ "eval_runtime": 23.9189,
370
+ "eval_samples_per_second": 125.424,
371
+ "eval_steps_per_second": 3.93,
372
+ "step": 123
373
+ },
374
+ {
375
+ "epoch": 1.66,
376
+ "eval_loss": 0.6903366446495056,
377
+ "eval_runtime": 23.9061,
378
+ "eval_samples_per_second": 125.491,
379
+ "eval_steps_per_second": 3.932,
380
+ "step": 126
381
+ },
382
+ {
383
+ "epoch": 1.7,
384
+ "eval_loss": 0.6882749199867249,
385
+ "eval_runtime": 23.9181,
386
+ "eval_samples_per_second": 125.428,
387
+ "eval_steps_per_second": 3.93,
388
+ "step": 129
389
+ },
390
+ {
391
+ "epoch": 1.74,
392
+ "eval_loss": 0.6863100528717041,
393
+ "eval_runtime": 23.914,
394
+ "eval_samples_per_second": 125.45,
395
+ "eval_steps_per_second": 3.931,
396
+ "step": 132
397
+ },
398
+ {
399
+ "epoch": 1.78,
400
+ "eval_loss": 0.6860549449920654,
401
+ "eval_runtime": 23.9138,
402
+ "eval_samples_per_second": 125.45,
403
+ "eval_steps_per_second": 3.931,
404
+ "step": 135
405
+ },
406
+ {
407
+ "epoch": 1.82,
408
+ "eval_loss": 0.6831715703010559,
409
+ "eval_runtime": 23.9135,
410
+ "eval_samples_per_second": 125.452,
411
+ "eval_steps_per_second": 3.931,
412
+ "step": 138
413
+ },
414
+ {
415
+ "epoch": 1.85,
416
+ "learning_rate": 0.0001641991924629879,
417
+ "loss": 0.6902,
418
+ "step": 140
419
+ },
420
+ {
421
+ "epoch": 1.86,
422
+ "eval_loss": 0.6819499731063843,
423
+ "eval_runtime": 23.8986,
424
+ "eval_samples_per_second": 125.53,
425
+ "eval_steps_per_second": 3.933,
426
+ "step": 141
427
+ },
428
+ {
429
+ "epoch": 1.9,
430
+ "eval_loss": 0.6807693839073181,
431
+ "eval_runtime": 23.9169,
432
+ "eval_samples_per_second": 125.434,
433
+ "eval_steps_per_second": 3.93,
434
+ "step": 144
435
+ },
436
+ {
437
+ "epoch": 1.94,
438
+ "eval_loss": 0.6787669062614441,
439
+ "eval_runtime": 23.9265,
440
+ "eval_samples_per_second": 125.384,
441
+ "eval_steps_per_second": 3.929,
442
+ "step": 147
443
+ },
444
+ {
445
+ "epoch": 1.98,
446
+ "eval_loss": 0.6773442625999451,
447
+ "eval_runtime": 23.9274,
448
+ "eval_samples_per_second": 125.38,
449
+ "eval_steps_per_second": 3.929,
450
+ "step": 150
451
+ },
452
+ {
453
+ "epoch": 2.02,
454
+ "eval_loss": 0.6759281158447266,
455
+ "eval_runtime": 23.9386,
456
+ "eval_samples_per_second": 125.321,
457
+ "eval_steps_per_second": 3.927,
458
+ "step": 153
459
+ },
460
+ {
461
+ "epoch": 2.06,
462
+ "eval_loss": 0.6743582487106323,
463
+ "eval_runtime": 23.9323,
464
+ "eval_samples_per_second": 125.354,
465
+ "eval_steps_per_second": 3.928,
466
+ "step": 156
467
+ },
468
+ {
469
+ "epoch": 2.1,
470
+ "eval_loss": 0.6732926368713379,
471
+ "eval_runtime": 23.9145,
472
+ "eval_samples_per_second": 125.447,
473
+ "eval_steps_per_second": 3.931,
474
+ "step": 159
475
+ },
476
+ {
477
+ "epoch": 2.11,
478
+ "learning_rate": 0.0001588156123822342,
479
+ "loss": 0.6766,
480
+ "step": 160
481
+ },
482
+ {
483
+ "epoch": 2.14,
484
+ "eval_loss": 0.6721953749656677,
485
+ "eval_runtime": 23.9073,
486
+ "eval_samples_per_second": 125.485,
487
+ "eval_steps_per_second": 3.932,
488
+ "step": 162
489
+ },
490
+ {
491
+ "epoch": 2.18,
492
+ "eval_loss": 0.6714429259300232,
493
+ "eval_runtime": 23.8955,
494
+ "eval_samples_per_second": 125.547,
495
+ "eval_steps_per_second": 3.934,
496
+ "step": 165
497
+ },
498
+ {
499
+ "epoch": 2.22,
500
+ "eval_loss": 0.670035183429718,
501
+ "eval_runtime": 23.9431,
502
+ "eval_samples_per_second": 125.297,
503
+ "eval_steps_per_second": 3.926,
504
+ "step": 168
505
+ },
506
+ {
507
+ "epoch": 2.26,
508
+ "eval_loss": 0.6695354580879211,
509
+ "eval_runtime": 23.8875,
510
+ "eval_samples_per_second": 125.589,
511
+ "eval_steps_per_second": 3.935,
512
+ "step": 171
513
+ },
514
+ {
515
+ "epoch": 2.3,
516
+ "eval_loss": 0.6689226031303406,
517
+ "eval_runtime": 23.9185,
518
+ "eval_samples_per_second": 125.426,
519
+ "eval_steps_per_second": 3.93,
520
+ "step": 174
521
+ },
522
+ {
523
+ "epoch": 2.34,
524
+ "eval_loss": 0.6674054861068726,
525
+ "eval_runtime": 23.941,
526
+ "eval_samples_per_second": 125.308,
527
+ "eval_steps_per_second": 3.926,
528
+ "step": 177
529
+ },
530
+ {
531
+ "epoch": 2.38,
532
+ "learning_rate": 0.00015343203230148048,
533
+ "loss": 0.6743,
534
+ "step": 180
535
+ },
536
+ {
537
+ "epoch": 2.38,
538
+ "eval_loss": 0.6664847731590271,
539
+ "eval_runtime": 23.9211,
540
+ "eval_samples_per_second": 125.412,
541
+ "eval_steps_per_second": 3.93,
542
+ "step": 180
543
+ },
544
+ {
545
+ "epoch": 2.41,
546
+ "eval_loss": 0.6658627986907959,
547
+ "eval_runtime": 23.9247,
548
+ "eval_samples_per_second": 125.394,
549
+ "eval_steps_per_second": 3.929,
550
+ "step": 183
551
+ },
552
+ {
553
+ "epoch": 2.45,
554
+ "eval_loss": 0.664908766746521,
555
+ "eval_runtime": 23.9272,
556
+ "eval_samples_per_second": 125.38,
557
+ "eval_steps_per_second": 3.929,
558
+ "step": 186
559
+ },
560
+ {
561
+ "epoch": 2.49,
562
+ "eval_loss": 0.6638036966323853,
563
+ "eval_runtime": 23.9187,
564
+ "eval_samples_per_second": 125.425,
565
+ "eval_steps_per_second": 3.93,
566
+ "step": 189
567
+ },
568
+ {
569
+ "epoch": 2.53,
570
+ "eval_loss": 0.6625837683677673,
571
+ "eval_runtime": 23.9033,
572
+ "eval_samples_per_second": 125.506,
573
+ "eval_steps_per_second": 3.933,
574
+ "step": 192
575
+ },
576
+ {
577
+ "epoch": 2.57,
578
+ "eval_loss": 0.6619511842727661,
579
+ "eval_runtime": 23.8973,
580
+ "eval_samples_per_second": 125.537,
581
+ "eval_steps_per_second": 3.934,
582
+ "step": 195
583
+ },
584
+ {
585
+ "epoch": 2.61,
586
+ "eval_loss": 0.6611769199371338,
587
+ "eval_runtime": 23.9129,
588
+ "eval_samples_per_second": 125.455,
589
+ "eval_steps_per_second": 3.931,
590
+ "step": 198
591
+ },
592
+ {
593
+ "epoch": 2.64,
594
+ "learning_rate": 0.00014804845222072678,
595
+ "loss": 0.6615,
596
+ "step": 200
597
+ },
598
+ {
599
+ "epoch": 2.65,
600
+ "eval_loss": 0.6606143116950989,
601
+ "eval_runtime": 23.9126,
602
+ "eval_samples_per_second": 125.457,
603
+ "eval_steps_per_second": 3.931,
604
+ "step": 201
605
+ },
606
+ {
607
+ "epoch": 2.69,
608
+ "eval_loss": 0.6589743494987488,
609
+ "eval_runtime": 23.9135,
610
+ "eval_samples_per_second": 125.452,
611
+ "eval_steps_per_second": 3.931,
612
+ "step": 204
613
+ },
614
+ {
615
+ "epoch": 2.73,
616
+ "eval_loss": 0.6578481197357178,
617
+ "eval_runtime": 23.9217,
618
+ "eval_samples_per_second": 125.409,
619
+ "eval_steps_per_second": 3.929,
620
+ "step": 207
621
+ },
622
+ {
623
+ "epoch": 2.77,
624
+ "eval_loss": 0.6571096181869507,
625
+ "eval_runtime": 23.9415,
626
+ "eval_samples_per_second": 125.305,
627
+ "eval_steps_per_second": 3.926,
628
+ "step": 210
629
+ },
630
+ {
631
+ "epoch": 2.81,
632
+ "eval_loss": 0.656689465045929,
633
+ "eval_runtime": 23.9111,
634
+ "eval_samples_per_second": 125.465,
635
+ "eval_steps_per_second": 3.931,
636
+ "step": 213
637
+ },
638
+ {
639
+ "epoch": 2.85,
640
+ "eval_loss": 0.6556207537651062,
641
+ "eval_runtime": 23.9099,
642
+ "eval_samples_per_second": 125.471,
643
+ "eval_steps_per_second": 3.931,
644
+ "step": 216
645
+ },
646
+ {
647
+ "epoch": 2.89,
648
+ "eval_loss": 0.6546627283096313,
649
+ "eval_runtime": 23.9164,
650
+ "eval_samples_per_second": 125.437,
651
+ "eval_steps_per_second": 3.93,
652
+ "step": 219
653
+ },
654
+ {
655
+ "epoch": 2.9,
656
+ "learning_rate": 0.0001426648721399731,
657
+ "loss": 0.6564,
658
+ "step": 220
659
+ },
660
+ {
661
+ "epoch": 2.93,
662
+ "eval_loss": 0.6539400815963745,
663
+ "eval_runtime": 23.906,
664
+ "eval_samples_per_second": 125.492,
665
+ "eval_steps_per_second": 3.932,
666
+ "step": 222
667
+ },
668
+ {
669
+ "epoch": 2.97,
670
+ "eval_loss": 0.653684675693512,
671
+ "eval_runtime": 23.9251,
672
+ "eval_samples_per_second": 125.391,
673
+ "eval_steps_per_second": 3.929,
674
+ "step": 225
675
+ },
676
+ {
677
+ "epoch": 3.01,
678
+ "eval_loss": 0.6526629328727722,
679
+ "eval_runtime": 23.9289,
680
+ "eval_samples_per_second": 125.371,
681
+ "eval_steps_per_second": 3.928,
682
+ "step": 228
683
+ },
684
+ {
685
+ "epoch": 3.05,
686
+ "eval_loss": 0.6525079011917114,
687
+ "eval_runtime": 23.9193,
688
+ "eval_samples_per_second": 125.421,
689
+ "eval_steps_per_second": 3.93,
690
+ "step": 231
691
+ },
692
+ {
693
+ "epoch": 3.09,
694
+ "eval_loss": 0.6514959931373596,
695
+ "eval_runtime": 23.9574,
696
+ "eval_samples_per_second": 125.223,
697
+ "eval_steps_per_second": 3.924,
698
+ "step": 234
699
+ },
700
+ {
701
+ "epoch": 3.13,
702
+ "eval_loss": 0.6507047414779663,
703
+ "eval_runtime": 23.9234,
704
+ "eval_samples_per_second": 125.4,
705
+ "eval_steps_per_second": 3.929,
706
+ "step": 237
707
+ },
708
+ {
709
+ "epoch": 3.17,
710
+ "learning_rate": 0.00013728129205921937,
711
+ "loss": 0.6469,
712
+ "step": 240
713
+ },
714
+ {
715
+ "epoch": 3.17,
716
+ "eval_loss": 0.6504186391830444,
717
+ "eval_runtime": 23.937,
718
+ "eval_samples_per_second": 125.329,
719
+ "eval_steps_per_second": 3.927,
720
+ "step": 240
721
+ },
722
+ {
723
+ "epoch": 3.21,
724
+ "eval_loss": 0.6495808959007263,
725
+ "eval_runtime": 23.9188,
726
+ "eval_samples_per_second": 125.425,
727
+ "eval_steps_per_second": 3.93,
728
+ "step": 243
729
+ },
730
+ {
731
+ "epoch": 3.25,
732
+ "eval_loss": 0.649512529373169,
733
+ "eval_runtime": 23.9209,
734
+ "eval_samples_per_second": 125.413,
735
+ "eval_steps_per_second": 3.93,
736
+ "step": 246
737
+ },
738
+ {
739
+ "epoch": 3.29,
740
+ "eval_loss": 0.648629903793335,
741
+ "eval_runtime": 23.9137,
742
+ "eval_samples_per_second": 125.451,
743
+ "eval_steps_per_second": 3.931,
744
+ "step": 249
745
+ },
746
+ {
747
+ "epoch": 3.33,
748
+ "eval_loss": 0.6480894088745117,
749
+ "eval_runtime": 23.919,
750
+ "eval_samples_per_second": 125.423,
751
+ "eval_steps_per_second": 3.93,
752
+ "step": 252
753
+ },
754
+ {
755
+ "epoch": 3.36,
756
+ "eval_loss": 0.6474400758743286,
757
+ "eval_runtime": 23.9076,
758
+ "eval_samples_per_second": 125.483,
759
+ "eval_steps_per_second": 3.932,
760
+ "step": 255
761
+ },
762
+ {
763
+ "epoch": 3.4,
764
+ "eval_loss": 0.6468291878700256,
765
+ "eval_runtime": 23.9305,
766
+ "eval_samples_per_second": 125.363,
767
+ "eval_steps_per_second": 3.928,
768
+ "step": 258
769
+ },
770
+ {
771
+ "epoch": 3.43,
772
+ "learning_rate": 0.00013189771197846567,
773
+ "loss": 0.6463,
774
+ "step": 260
775
+ },
776
+ {
777
+ "epoch": 3.44,
778
+ "eval_loss": 0.6462663412094116,
779
+ "eval_runtime": 23.9359,
780
+ "eval_samples_per_second": 125.335,
781
+ "eval_steps_per_second": 3.927,
782
+ "step": 261
783
+ },
784
+ {
785
+ "epoch": 3.48,
786
+ "eval_loss": 0.6458565592765808,
787
+ "eval_runtime": 23.929,
788
+ "eval_samples_per_second": 125.371,
789
+ "eval_steps_per_second": 3.928,
790
+ "step": 264
791
+ },
792
+ {
793
+ "epoch": 3.52,
794
+ "eval_loss": 0.645412266254425,
795
+ "eval_runtime": 23.9362,
796
+ "eval_samples_per_second": 125.333,
797
+ "eval_steps_per_second": 3.927,
798
+ "step": 267
799
+ },
800
+ {
801
+ "epoch": 3.56,
802
+ "eval_loss": 0.6449554562568665,
803
+ "eval_runtime": 23.9004,
804
+ "eval_samples_per_second": 125.521,
805
+ "eval_steps_per_second": 3.933,
806
+ "step": 270
807
+ },
808
+ {
809
+ "epoch": 3.6,
810
+ "eval_loss": 0.6443325281143188,
811
+ "eval_runtime": 23.9065,
812
+ "eval_samples_per_second": 125.489,
813
+ "eval_steps_per_second": 3.932,
814
+ "step": 273
815
+ },
816
+ {
817
+ "epoch": 3.64,
818
+ "eval_loss": 0.6435034871101379,
819
+ "eval_runtime": 23.9072,
820
+ "eval_samples_per_second": 125.485,
821
+ "eval_steps_per_second": 3.932,
822
+ "step": 276
823
+ },
824
+ {
825
+ "epoch": 3.68,
826
+ "eval_loss": 0.6433733701705933,
827
+ "eval_runtime": 23.9042,
828
+ "eval_samples_per_second": 125.501,
829
+ "eval_steps_per_second": 3.932,
830
+ "step": 279
831
+ },
832
+ {
833
+ "epoch": 3.69,
834
+ "learning_rate": 0.00012651413189771198,
835
+ "loss": 0.6389,
836
+ "step": 280
837
+ },
838
+ {
839
+ "epoch": 3.72,
840
+ "eval_loss": 0.6425070762634277,
841
+ "eval_runtime": 23.8874,
842
+ "eval_samples_per_second": 125.589,
843
+ "eval_steps_per_second": 3.935,
844
+ "step": 282
845
+ },
846
+ {
847
+ "epoch": 3.76,
848
+ "eval_loss": 0.642119288444519,
849
+ "eval_runtime": 23.9328,
850
+ "eval_samples_per_second": 125.351,
851
+ "eval_steps_per_second": 3.928,
852
+ "step": 285
853
+ },
854
+ {
855
+ "epoch": 3.8,
856
+ "eval_loss": 0.641748309135437,
857
+ "eval_runtime": 23.9294,
858
+ "eval_samples_per_second": 125.369,
859
+ "eval_steps_per_second": 3.928,
860
+ "step": 288
861
+ },
862
+ {
863
+ "epoch": 3.84,
864
+ "eval_loss": 0.640826404094696,
865
+ "eval_runtime": 23.9434,
866
+ "eval_samples_per_second": 125.296,
867
+ "eval_steps_per_second": 3.926,
868
+ "step": 291
869
+ },
870
+ {
871
+ "epoch": 3.88,
872
+ "eval_loss": 0.6402388215065002,
873
+ "eval_runtime": 23.9162,
874
+ "eval_samples_per_second": 125.438,
875
+ "eval_steps_per_second": 3.93,
876
+ "step": 294
877
+ },
878
+ {
879
+ "epoch": 3.92,
880
+ "eval_loss": 0.6407353281974792,
881
+ "eval_runtime": 23.9121,
882
+ "eval_samples_per_second": 125.46,
883
+ "eval_steps_per_second": 3.931,
884
+ "step": 297
885
+ },
886
+ {
887
+ "epoch": 3.96,
888
+ "learning_rate": 0.0001211305518169583,
889
+ "loss": 0.6318,
890
+ "step": 300
891
+ },
892
+ {
893
+ "epoch": 3.96,
894
+ "eval_loss": 0.6398600935935974,
895
+ "eval_runtime": 23.9229,
896
+ "eval_samples_per_second": 125.403,
897
+ "eval_steps_per_second": 3.929,
898
+ "step": 300
899
+ },
900
+ {
901
+ "epoch": 4.0,
902
+ "eval_loss": 0.6393464207649231,
903
+ "eval_runtime": 23.9187,
904
+ "eval_samples_per_second": 125.425,
905
+ "eval_steps_per_second": 3.93,
906
+ "step": 303
907
+ },
908
+ {
909
+ "epoch": 4.04,
910
+ "eval_loss": 0.6392526626586914,
911
+ "eval_runtime": 23.9074,
912
+ "eval_samples_per_second": 125.484,
913
+ "eval_steps_per_second": 3.932,
914
+ "step": 306
915
+ },
916
+ {
917
+ "epoch": 4.08,
918
+ "eval_loss": 0.6389594078063965,
919
+ "eval_runtime": 23.918,
920
+ "eval_samples_per_second": 125.428,
921
+ "eval_steps_per_second": 3.93,
922
+ "step": 309
923
+ },
924
+ {
925
+ "epoch": 4.12,
926
+ "eval_loss": 0.6388808488845825,
927
+ "eval_runtime": 23.9158,
928
+ "eval_samples_per_second": 125.44,
929
+ "eval_steps_per_second": 3.93,
930
+ "step": 312
931
+ },
932
+ {
933
+ "epoch": 4.16,
934
+ "eval_loss": 0.6384025812149048,
935
+ "eval_runtime": 23.9176,
936
+ "eval_samples_per_second": 125.431,
937
+ "eval_steps_per_second": 3.93,
938
+ "step": 315
939
+ },
940
+ {
941
+ "epoch": 4.2,
942
+ "eval_loss": 0.6387144923210144,
943
+ "eval_runtime": 23.9047,
944
+ "eval_samples_per_second": 125.498,
945
+ "eval_steps_per_second": 3.932,
946
+ "step": 318
947
+ },
948
+ {
949
+ "epoch": 4.22,
950
+ "learning_rate": 0.00011574697173620459,
951
+ "loss": 0.6277,
952
+ "step": 320
953
+ },
954
+ {
955
+ "epoch": 4.24,
956
+ "eval_loss": 0.6377059817314148,
957
+ "eval_runtime": 23.9246,
958
+ "eval_samples_per_second": 125.394,
959
+ "eval_steps_per_second": 3.929,
960
+ "step": 321
961
+ },
962
+ {
963
+ "epoch": 4.28,
964
+ "eval_loss": 0.636981189250946,
965
+ "eval_runtime": 23.9459,
966
+ "eval_samples_per_second": 125.283,
967
+ "eval_steps_per_second": 3.926,
968
+ "step": 324
969
+ },
970
+ {
971
+ "epoch": 4.32,
972
+ "eval_loss": 0.6364036202430725,
973
+ "eval_runtime": 23.9206,
974
+ "eval_samples_per_second": 125.415,
975
+ "eval_steps_per_second": 3.93,
976
+ "step": 327
977
+ },
978
+ {
979
+ "epoch": 4.35,
980
+ "eval_loss": 0.6357031464576721,
981
+ "eval_runtime": 23.9187,
982
+ "eval_samples_per_second": 125.425,
983
+ "eval_steps_per_second": 3.93,
984
+ "step": 330
985
+ },
986
+ {
987
+ "epoch": 4.39,
988
+ "eval_loss": 0.6366411447525024,
989
+ "eval_runtime": 23.9159,
990
+ "eval_samples_per_second": 125.44,
991
+ "eval_steps_per_second": 3.93,
992
+ "step": 333
993
+ },
994
+ {
995
+ "epoch": 4.43,
996
+ "eval_loss": 0.6357526183128357,
997
+ "eval_runtime": 23.9135,
998
+ "eval_samples_per_second": 125.452,
999
+ "eval_steps_per_second": 3.931,
1000
+ "step": 336
1001
+ },
1002
+ {
1003
+ "epoch": 4.47,
1004
+ "eval_loss": 0.6349912881851196,
1005
+ "eval_runtime": 23.9211,
1006
+ "eval_samples_per_second": 125.412,
1007
+ "eval_steps_per_second": 3.93,
1008
+ "step": 339
1009
+ },
1010
+ {
1011
+ "epoch": 4.49,
1012
+ "learning_rate": 0.00011036339165545088,
1013
+ "loss": 0.6303,
1014
+ "step": 340
1015
+ },
1016
+ {
1017
+ "epoch": 4.51,
1018
+ "eval_loss": 0.6343324184417725,
1019
+ "eval_runtime": 23.927,
1020
+ "eval_samples_per_second": 125.381,
1021
+ "eval_steps_per_second": 3.929,
1022
+ "step": 342
1023
+ },
1024
+ {
1025
+ "epoch": 4.55,
1026
+ "eval_loss": 0.6347218751907349,
1027
+ "eval_runtime": 23.9489,
1028
+ "eval_samples_per_second": 125.267,
1029
+ "eval_steps_per_second": 3.925,
1030
+ "step": 345
1031
+ },
1032
+ {
1033
+ "epoch": 4.59,
1034
+ "eval_loss": 0.6333290338516235,
1035
+ "eval_runtime": 23.9573,
1036
+ "eval_samples_per_second": 125.223,
1037
+ "eval_steps_per_second": 3.924,
1038
+ "step": 348
1039
+ },
1040
+ {
1041
+ "epoch": 4.63,
1042
+ "eval_loss": 0.6328045129776001,
1043
+ "eval_runtime": 23.925,
1044
+ "eval_samples_per_second": 125.392,
1045
+ "eval_steps_per_second": 3.929,
1046
+ "step": 351
1047
+ },
1048
+ {
1049
+ "epoch": 4.67,
1050
+ "eval_loss": 0.6328830718994141,
1051
+ "eval_runtime": 23.9277,
1052
+ "eval_samples_per_second": 125.378,
1053
+ "eval_steps_per_second": 3.928,
1054
+ "step": 354
1055
+ },
1056
+ {
1057
+ "epoch": 4.71,
1058
+ "eval_loss": 0.6323109269142151,
1059
+ "eval_runtime": 23.9385,
1060
+ "eval_samples_per_second": 125.321,
1061
+ "eval_steps_per_second": 3.927,
1062
+ "step": 357
1063
+ },
1064
+ {
1065
+ "epoch": 4.75,
1066
+ "learning_rate": 0.00010497981157469719,
1067
+ "loss": 0.6268,
1068
+ "step": 360
1069
+ },
1070
+ {
1071
+ "epoch": 4.75,
1072
+ "eval_loss": 0.6327587366104126,
1073
+ "eval_runtime": 23.9389,
1074
+ "eval_samples_per_second": 125.319,
1075
+ "eval_steps_per_second": 3.927,
1076
+ "step": 360
1077
+ },
1078
+ {
1079
+ "epoch": 4.79,
1080
+ "eval_loss": 0.6324266791343689,
1081
+ "eval_runtime": 23.9367,
1082
+ "eval_samples_per_second": 125.331,
1083
+ "eval_steps_per_second": 3.927,
1084
+ "step": 363
1085
+ },
1086
+ {
1087
+ "epoch": 4.83,
1088
+ "eval_loss": 0.6320524215698242,
1089
+ "eval_runtime": 23.9373,
1090
+ "eval_samples_per_second": 125.327,
1091
+ "eval_steps_per_second": 3.927,
1092
+ "step": 366
1093
+ },
1094
+ {
1095
+ "epoch": 4.87,
1096
+ "eval_loss": 0.6314539313316345,
1097
+ "eval_runtime": 23.9325,
1098
+ "eval_samples_per_second": 125.352,
1099
+ "eval_steps_per_second": 3.928,
1100
+ "step": 369
1101
+ },
1102
+ {
1103
+ "epoch": 4.91,
1104
+ "eval_loss": 0.6318089365959167,
1105
+ "eval_runtime": 23.9345,
1106
+ "eval_samples_per_second": 125.342,
1107
+ "eval_steps_per_second": 3.927,
1108
+ "step": 372
1109
+ },
1110
+ {
1111
+ "epoch": 4.95,
1112
+ "eval_loss": 0.6315808296203613,
1113
+ "eval_runtime": 23.924,
1114
+ "eval_samples_per_second": 125.397,
1115
+ "eval_steps_per_second": 3.929,
1116
+ "step": 375
1117
+ },
1118
+ {
1119
+ "epoch": 4.99,
1120
+ "eval_loss": 0.630818247795105,
1121
+ "eval_runtime": 23.9285,
1122
+ "eval_samples_per_second": 125.373,
1123
+ "eval_steps_per_second": 3.928,
1124
+ "step": 378
1125
+ },
1126
+ {
1127
+ "epoch": 5.01,
1128
+ "learning_rate": 9.959623149394348e-05,
1129
+ "loss": 0.6196,
1130
+ "step": 380
1131
+ },
1132
+ {
1133
+ "epoch": 5.03,
1134
+ "eval_loss": 0.630248486995697,
1135
+ "eval_runtime": 23.9231,
1136
+ "eval_samples_per_second": 125.402,
1137
+ "eval_steps_per_second": 3.929,
1138
+ "step": 381
1139
+ },
1140
+ {
1141
+ "epoch": 5.07,
1142
+ "eval_loss": 0.6306143403053284,
1143
+ "eval_runtime": 23.9242,
1144
+ "eval_samples_per_second": 125.396,
1145
+ "eval_steps_per_second": 3.929,
1146
+ "step": 384
1147
+ },
1148
+ {
1149
+ "epoch": 5.11,
1150
+ "eval_loss": 0.6305729746818542,
1151
+ "eval_runtime": 23.9232,
1152
+ "eval_samples_per_second": 125.401,
1153
+ "eval_steps_per_second": 3.929,
1154
+ "step": 387
1155
+ },
1156
+ {
1157
+ "epoch": 5.15,
1158
+ "eval_loss": 0.6302648782730103,
1159
+ "eval_runtime": 23.9286,
1160
+ "eval_samples_per_second": 125.373,
1161
+ "eval_steps_per_second": 3.928,
1162
+ "step": 390
1163
+ },
1164
+ {
1165
+ "epoch": 5.19,
1166
+ "eval_loss": 0.6298710703849792,
1167
+ "eval_runtime": 23.9258,
1168
+ "eval_samples_per_second": 125.388,
1169
+ "eval_steps_per_second": 3.929,
1170
+ "step": 393
1171
+ },
1172
+ {
1173
+ "epoch": 5.23,
1174
+ "eval_loss": 0.6298263669013977,
1175
+ "eval_runtime": 23.9284,
1176
+ "eval_samples_per_second": 125.374,
1177
+ "eval_steps_per_second": 3.928,
1178
+ "step": 396
1179
+ },
1180
+ {
1181
+ "epoch": 5.27,
1182
+ "eval_loss": 0.6292470097541809,
1183
+ "eval_runtime": 23.9269,
1184
+ "eval_samples_per_second": 125.382,
1185
+ "eval_steps_per_second": 3.929,
1186
+ "step": 399
1187
+ },
1188
+ {
1189
+ "epoch": 5.28,
1190
+ "learning_rate": 9.421265141318977e-05,
1191
+ "loss": 0.6146,
1192
+ "step": 400
1193
+ },
1194
+ {
1195
+ "epoch": 5.3,
1196
+ "eval_loss": 0.6291049122810364,
1197
+ "eval_runtime": 23.9297,
1198
+ "eval_samples_per_second": 125.367,
1199
+ "eval_steps_per_second": 3.928,
1200
+ "step": 402
1201
+ },
1202
+ {
1203
+ "epoch": 5.34,
1204
+ "eval_loss": 0.6296722292900085,
1205
+ "eval_runtime": 23.9386,
1206
+ "eval_samples_per_second": 125.321,
1207
+ "eval_steps_per_second": 3.927,
1208
+ "step": 405
1209
+ },
1210
+ {
1211
+ "epoch": 5.38,
1212
+ "eval_loss": 0.6288275122642517,
1213
+ "eval_runtime": 23.9308,
1214
+ "eval_samples_per_second": 125.362,
1215
+ "eval_steps_per_second": 3.928,
1216
+ "step": 408
1217
+ },
1218
+ {
1219
+ "epoch": 5.42,
1220
+ "eval_loss": 0.6288333535194397,
1221
+ "eval_runtime": 23.9261,
1222
+ "eval_samples_per_second": 125.386,
1223
+ "eval_steps_per_second": 3.929,
1224
+ "step": 411
1225
+ },
1226
+ {
1227
+ "epoch": 5.46,
1228
+ "eval_loss": 0.6279690861701965,
1229
+ "eval_runtime": 23.9282,
1230
+ "eval_samples_per_second": 125.375,
1231
+ "eval_steps_per_second": 3.928,
1232
+ "step": 414
1233
+ },
1234
+ {
1235
+ "epoch": 5.5,
1236
+ "eval_loss": 0.6275332570075989,
1237
+ "eval_runtime": 23.9215,
1238
+ "eval_samples_per_second": 125.41,
1239
+ "eval_steps_per_second": 3.93,
1240
+ "step": 417
1241
+ },
1242
+ {
1243
+ "epoch": 5.54,
1244
+ "learning_rate": 8.882907133243608e-05,
1245
+ "loss": 0.6149,
1246
+ "step": 420
1247
+ },
1248
+ {
1249
+ "epoch": 5.54,
1250
+ "eval_loss": 0.6279338598251343,
1251
+ "eval_runtime": 23.93,
1252
+ "eval_samples_per_second": 125.366,
1253
+ "eval_steps_per_second": 3.928,
1254
+ "step": 420
1255
+ },
1256
+ {
1257
+ "epoch": 5.58,
1258
+ "eval_loss": 0.6271057724952698,
1259
+ "eval_runtime": 23.9158,
1260
+ "eval_samples_per_second": 125.44,
1261
+ "eval_steps_per_second": 3.93,
1262
+ "step": 423
1263
+ },
1264
+ {
1265
+ "epoch": 5.62,
1266
+ "eval_loss": 0.6270298361778259,
1267
+ "eval_runtime": 23.9264,
1268
+ "eval_samples_per_second": 125.384,
1269
+ "eval_steps_per_second": 3.929,
1270
+ "step": 426
1271
+ },
1272
+ {
1273
+ "epoch": 5.66,
1274
+ "eval_loss": 0.6271407604217529,
1275
+ "eval_runtime": 23.9362,
1276
+ "eval_samples_per_second": 125.333,
1277
+ "eval_steps_per_second": 3.927,
1278
+ "step": 429
1279
+ },
1280
+ {
1281
+ "epoch": 5.7,
1282
+ "eval_loss": 0.6264240145683289,
1283
+ "eval_runtime": 23.9309,
1284
+ "eval_samples_per_second": 125.361,
1285
+ "eval_steps_per_second": 3.928,
1286
+ "step": 432
1287
+ },
1288
+ {
1289
+ "epoch": 5.74,
1290
+ "eval_loss": 0.6263339519500732,
1291
+ "eval_runtime": 23.93,
1292
+ "eval_samples_per_second": 125.366,
1293
+ "eval_steps_per_second": 3.928,
1294
+ "step": 435
1295
+ },
1296
+ {
1297
+ "epoch": 5.78,
1298
+ "eval_loss": 0.6256468296051025,
1299
+ "eval_runtime": 23.9252,
1300
+ "eval_samples_per_second": 125.391,
1301
+ "eval_steps_per_second": 3.929,
1302
+ "step": 438
1303
+ },
1304
+ {
1305
+ "epoch": 5.81,
1306
+ "learning_rate": 8.344549125168237e-05,
1307
+ "loss": 0.6191,
1308
+ "step": 440
1309
+ },
1310
+ {
1311
+ "epoch": 5.82,
1312
+ "eval_loss": 0.6260586977005005,
1313
+ "eval_runtime": 23.918,
1314
+ "eval_samples_per_second": 125.429,
1315
+ "eval_steps_per_second": 3.93,
1316
+ "step": 441
1317
+ },
1318
+ {
1319
+ "epoch": 5.86,
1320
+ "eval_loss": 0.625337541103363,
1321
+ "eval_runtime": 23.9172,
1322
+ "eval_samples_per_second": 125.433,
1323
+ "eval_steps_per_second": 3.93,
1324
+ "step": 444
1325
+ },
1326
+ {
1327
+ "epoch": 5.9,
1328
+ "eval_loss": 0.6246620416641235,
1329
+ "eval_runtime": 23.908,
1330
+ "eval_samples_per_second": 125.481,
1331
+ "eval_steps_per_second": 3.932,
1332
+ "step": 447
1333
+ },
1334
+ {
1335
+ "epoch": 5.94,
1336
+ "eval_loss": 0.6251673698425293,
1337
+ "eval_runtime": 23.9137,
1338
+ "eval_samples_per_second": 125.451,
1339
+ "eval_steps_per_second": 3.931,
1340
+ "step": 450
1341
+ },
1342
+ {
1343
+ "epoch": 5.98,
1344
+ "eval_loss": 0.6253092288970947,
1345
+ "eval_runtime": 23.9409,
1346
+ "eval_samples_per_second": 125.309,
1347
+ "eval_steps_per_second": 3.926,
1348
+ "step": 453
1349
+ },
1350
+ {
1351
+ "epoch": 6.02,
1352
+ "eval_loss": 0.6245599389076233,
1353
+ "eval_runtime": 23.9233,
1354
+ "eval_samples_per_second": 125.401,
1355
+ "eval_steps_per_second": 3.929,
1356
+ "step": 456
1357
+ },
1358
+ {
1359
+ "epoch": 6.06,
1360
+ "eval_loss": 0.6247097849845886,
1361
+ "eval_runtime": 23.9184,
1362
+ "eval_samples_per_second": 125.426,
1363
+ "eval_steps_per_second": 3.93,
1364
+ "step": 459
1365
+ },
1366
+ {
1367
+ "epoch": 6.07,
1368
+ "learning_rate": 7.806191117092868e-05,
1369
+ "loss": 0.6075,
1370
+ "step": 460
1371
+ },
1372
+ {
1373
+ "epoch": 6.1,
1374
+ "eval_loss": 0.6237714290618896,
1375
+ "eval_runtime": 23.9137,
1376
+ "eval_samples_per_second": 125.451,
1377
+ "eval_steps_per_second": 3.931,
1378
+ "step": 462
1379
+ },
1380
+ {
1381
+ "epoch": 6.14,
1382
+ "eval_loss": 0.6239632964134216,
1383
+ "eval_runtime": 23.9142,
1384
+ "eval_samples_per_second": 125.449,
1385
+ "eval_steps_per_second": 3.931,
1386
+ "step": 465
1387
+ },
1388
+ {
1389
+ "epoch": 6.18,
1390
+ "eval_loss": 0.6246253252029419,
1391
+ "eval_runtime": 23.9199,
1392
+ "eval_samples_per_second": 125.419,
1393
+ "eval_steps_per_second": 3.93,
1394
+ "step": 468
1395
+ },
1396
+ {
1397
+ "epoch": 6.22,
1398
+ "eval_loss": 0.6236398220062256,
1399
+ "eval_runtime": 23.9169,
1400
+ "eval_samples_per_second": 125.434,
1401
+ "eval_steps_per_second": 3.93,
1402
+ "step": 471
1403
+ },
1404
+ {
1405
+ "epoch": 6.25,
1406
+ "eval_loss": 0.6242309808731079,
1407
+ "eval_runtime": 23.9091,
1408
+ "eval_samples_per_second": 125.475,
1409
+ "eval_steps_per_second": 3.932,
1410
+ "step": 474
1411
+ },
1412
+ {
1413
+ "epoch": 6.29,
1414
+ "eval_loss": 0.6236902475357056,
1415
+ "eval_runtime": 23.9117,
1416
+ "eval_samples_per_second": 125.462,
1417
+ "eval_steps_per_second": 3.931,
1418
+ "step": 477
1419
+ },
1420
+ {
1421
+ "epoch": 6.33,
1422
+ "learning_rate": 7.267833109017497e-05,
1423
+ "loss": 0.6061,
1424
+ "step": 480
1425
+ },
1426
+ {
1427
+ "epoch": 6.33,
1428
+ "eval_loss": 0.623267650604248,
1429
+ "eval_runtime": 23.9071,
1430
+ "eval_samples_per_second": 125.485,
1431
+ "eval_steps_per_second": 3.932,
1432
+ "step": 480
1433
+ },
1434
+ {
1435
+ "epoch": 6.37,
1436
+ "eval_loss": 0.6238719820976257,
1437
+ "eval_runtime": 23.9206,
1438
+ "eval_samples_per_second": 125.415,
1439
+ "eval_steps_per_second": 3.93,
1440
+ "step": 483
1441
+ },
1442
+ {
1443
+ "epoch": 6.41,
1444
+ "eval_loss": 0.6234752535820007,
1445
+ "eval_runtime": 23.914,
1446
+ "eval_samples_per_second": 125.449,
1447
+ "eval_steps_per_second": 3.931,
1448
+ "step": 486
1449
+ },
1450
+ {
1451
+ "epoch": 6.45,
1452
+ "eval_loss": 0.6228368878364563,
1453
+ "eval_runtime": 23.9087,
1454
+ "eval_samples_per_second": 125.477,
1455
+ "eval_steps_per_second": 3.932,
1456
+ "step": 489
1457
+ },
1458
+ {
1459
+ "epoch": 6.49,
1460
+ "eval_loss": 0.6226744055747986,
1461
+ "eval_runtime": 23.9118,
1462
+ "eval_samples_per_second": 125.461,
1463
+ "eval_steps_per_second": 3.931,
1464
+ "step": 492
1465
+ },
1466
+ {
1467
+ "epoch": 6.53,
1468
+ "eval_loss": 0.622622013092041,
1469
+ "eval_runtime": 23.9341,
1470
+ "eval_samples_per_second": 125.344,
1471
+ "eval_steps_per_second": 3.927,
1472
+ "step": 495
1473
+ },
1474
+ {
1475
+ "epoch": 6.57,
1476
+ "eval_loss": 0.6228298544883728,
1477
+ "eval_runtime": 23.9079,
1478
+ "eval_samples_per_second": 125.482,
1479
+ "eval_steps_per_second": 3.932,
1480
+ "step": 498
1481
+ },
1482
+ {
1483
+ "epoch": 6.6,
1484
+ "learning_rate": 6.729475100942126e-05,
1485
+ "loss": 0.6043,
1486
+ "step": 500
1487
+ },
1488
+ {
1489
+ "epoch": 6.61,
1490
+ "eval_loss": 0.6232237815856934,
1491
+ "eval_runtime": 23.8982,
1492
+ "eval_samples_per_second": 125.533,
1493
+ "eval_steps_per_second": 3.933,
1494
+ "step": 501
1495
+ },
1496
+ {
1497
+ "epoch": 6.65,
1498
+ "eval_loss": 0.6218205690383911,
1499
+ "eval_runtime": 23.9059,
1500
+ "eval_samples_per_second": 125.492,
1501
+ "eval_steps_per_second": 3.932,
1502
+ "step": 504
1503
+ },
1504
+ {
1505
+ "epoch": 6.69,
1506
+ "eval_loss": 0.621903657913208,
1507
+ "eval_runtime": 23.8991,
1508
+ "eval_samples_per_second": 125.528,
1509
+ "eval_steps_per_second": 3.933,
1510
+ "step": 507
1511
+ },
1512
+ {
1513
+ "epoch": 6.73,
1514
+ "eval_loss": 0.622235894203186,
1515
+ "eval_runtime": 23.9024,
1516
+ "eval_samples_per_second": 125.51,
1517
+ "eval_steps_per_second": 3.933,
1518
+ "step": 510
1519
+ },
1520
+ {
1521
+ "epoch": 6.77,
1522
+ "eval_loss": 0.6220830082893372,
1523
+ "eval_runtime": 23.8926,
1524
+ "eval_samples_per_second": 125.562,
1525
+ "eval_steps_per_second": 3.934,
1526
+ "step": 513
1527
+ },
1528
+ {
1529
+ "epoch": 6.81,
1530
+ "eval_loss": 0.6220167875289917,
1531
+ "eval_runtime": 23.8965,
1532
+ "eval_samples_per_second": 125.542,
1533
+ "eval_steps_per_second": 3.934,
1534
+ "step": 516
1535
+ },
1536
+ {
1537
+ "epoch": 6.85,
1538
+ "eval_loss": 0.6222782135009766,
1539
+ "eval_runtime": 23.908,
1540
+ "eval_samples_per_second": 125.481,
1541
+ "eval_steps_per_second": 3.932,
1542
+ "step": 519
1543
+ },
1544
+ {
1545
+ "epoch": 6.86,
1546
+ "learning_rate": 6.191117092866757e-05,
1547
+ "loss": 0.6008,
1548
+ "step": 520
1549
+ },
1550
+ {
1551
+ "epoch": 6.89,
1552
+ "eval_loss": 0.6216304302215576,
1553
+ "eval_runtime": 23.9036,
1554
+ "eval_samples_per_second": 125.504,
1555
+ "eval_steps_per_second": 3.932,
1556
+ "step": 522
1557
+ },
1558
+ {
1559
+ "epoch": 6.93,
1560
+ "eval_loss": 0.6217759847640991,
1561
+ "eval_runtime": 23.9088,
1562
+ "eval_samples_per_second": 125.477,
1563
+ "eval_steps_per_second": 3.932,
1564
+ "step": 525
1565
+ },
1566
+ {
1567
+ "epoch": 6.97,
1568
+ "eval_loss": 0.6214317083358765,
1569
+ "eval_runtime": 23.9177,
1570
+ "eval_samples_per_second": 125.43,
1571
+ "eval_steps_per_second": 3.93,
1572
+ "step": 528
1573
+ },
1574
+ {
1575
+ "epoch": 7.01,
1576
+ "eval_loss": 0.6213416457176208,
1577
+ "eval_runtime": 23.9138,
1578
+ "eval_samples_per_second": 125.451,
1579
+ "eval_steps_per_second": 3.931,
1580
+ "step": 531
1581
+ },
1582
+ {
1583
+ "epoch": 7.05,
1584
+ "eval_loss": 0.6217712163925171,
1585
+ "eval_runtime": 23.9141,
1586
+ "eval_samples_per_second": 125.449,
1587
+ "eval_steps_per_second": 3.931,
1588
+ "step": 534
1589
+ },
1590
+ {
1591
+ "epoch": 7.09,
1592
+ "eval_loss": 0.6215860843658447,
1593
+ "eval_runtime": 23.9145,
1594
+ "eval_samples_per_second": 125.447,
1595
+ "eval_steps_per_second": 3.931,
1596
+ "step": 537
1597
+ },
1598
+ {
1599
+ "epoch": 7.13,
1600
+ "learning_rate": 5.652759084791387e-05,
1601
+ "loss": 0.599,
1602
+ "step": 540
1603
+ },
1604
+ {
1605
+ "epoch": 7.13,
1606
+ "eval_loss": 0.6211041808128357,
1607
+ "eval_runtime": 23.9125,
1608
+ "eval_samples_per_second": 125.457,
1609
+ "eval_steps_per_second": 3.931,
1610
+ "step": 540
1611
+ },
1612
+ {
1613
+ "epoch": 7.17,
1614
+ "eval_loss": 0.6210355758666992,
1615
+ "eval_runtime": 23.911,
1616
+ "eval_samples_per_second": 125.465,
1617
+ "eval_steps_per_second": 3.931,
1618
+ "step": 543
1619
+ },
1620
+ {
1621
+ "epoch": 7.2,
1622
+ "eval_loss": 0.6209889650344849,
1623
+ "eval_runtime": 23.9062,
1624
+ "eval_samples_per_second": 125.491,
1625
+ "eval_steps_per_second": 3.932,
1626
+ "step": 546
1627
+ },
1628
+ {
1629
+ "epoch": 7.24,
1630
+ "eval_loss": 0.6205114126205444,
1631
+ "eval_runtime": 23.9227,
1632
+ "eval_samples_per_second": 125.404,
1633
+ "eval_steps_per_second": 3.929,
1634
+ "step": 549
1635
+ },
1636
+ {
1637
+ "epoch": 7.28,
1638
+ "eval_loss": 0.6204013824462891,
1639
+ "eval_runtime": 23.9146,
1640
+ "eval_samples_per_second": 125.446,
1641
+ "eval_steps_per_second": 3.931,
1642
+ "step": 552
1643
+ },
1644
+ {
1645
+ "epoch": 7.32,
1646
+ "eval_loss": 0.6202988028526306,
1647
+ "eval_runtime": 23.9015,
1648
+ "eval_samples_per_second": 125.515,
1649
+ "eval_steps_per_second": 3.933,
1650
+ "step": 555
1651
+ },
1652
+ {
1653
+ "epoch": 7.36,
1654
+ "eval_loss": 0.6199727654457092,
1655
+ "eval_runtime": 23.9089,
1656
+ "eval_samples_per_second": 125.476,
1657
+ "eval_steps_per_second": 3.932,
1658
+ "step": 558
1659
+ },
1660
+ {
1661
+ "epoch": 7.39,
1662
+ "learning_rate": 5.1144010767160164e-05,
1663
+ "loss": 0.5959,
1664
+ "step": 560
1665
+ },
1666
+ {
1667
+ "epoch": 7.4,
1668
+ "eval_loss": 0.619968831539154,
1669
+ "eval_runtime": 23.9043,
1670
+ "eval_samples_per_second": 125.5,
1671
+ "eval_steps_per_second": 3.932,
1672
+ "step": 561
1673
+ },
1674
+ {
1675
+ "epoch": 7.44,
1676
+ "eval_loss": 0.6202374696731567,
1677
+ "eval_runtime": 23.9117,
1678
+ "eval_samples_per_second": 125.461,
1679
+ "eval_steps_per_second": 3.931,
1680
+ "step": 564
1681
+ },
1682
+ {
1683
+ "epoch": 7.48,
1684
+ "eval_loss": 0.6202066540718079,
1685
+ "eval_runtime": 23.908,
1686
+ "eval_samples_per_second": 125.481,
1687
+ "eval_steps_per_second": 3.932,
1688
+ "step": 567
1689
+ },
1690
+ {
1691
+ "epoch": 7.52,
1692
+ "eval_loss": 0.6198835968971252,
1693
+ "eval_runtime": 23.9244,
1694
+ "eval_samples_per_second": 125.395,
1695
+ "eval_steps_per_second": 3.929,
1696
+ "step": 570
1697
+ },
1698
+ {
1699
+ "epoch": 7.56,
1700
+ "eval_loss": 0.6199198961257935,
1701
+ "eval_runtime": 23.9263,
1702
+ "eval_samples_per_second": 125.385,
1703
+ "eval_steps_per_second": 3.929,
1704
+ "step": 573
1705
+ },
1706
+ {
1707
+ "epoch": 7.6,
1708
+ "eval_loss": 0.6195517182350159,
1709
+ "eval_runtime": 23.9125,
1710
+ "eval_samples_per_second": 125.457,
1711
+ "eval_steps_per_second": 3.931,
1712
+ "step": 576
1713
+ },
1714
+ {
1715
+ "epoch": 7.64,
1716
+ "eval_loss": 0.6192638278007507,
1717
+ "eval_runtime": 23.9168,
1718
+ "eval_samples_per_second": 125.435,
1719
+ "eval_steps_per_second": 3.93,
1720
+ "step": 579
1721
+ },
1722
+ {
1723
+ "epoch": 7.65,
1724
+ "learning_rate": 4.576043068640646e-05,
1725
+ "loss": 0.5922,
1726
+ "step": 580
1727
+ },
1728
+ {
1729
+ "epoch": 7.68,
1730
+ "eval_loss": 0.6196587085723877,
1731
+ "eval_runtime": 23.9305,
1732
+ "eval_samples_per_second": 125.363,
1733
+ "eval_steps_per_second": 3.928,
1734
+ "step": 582
1735
+ },
1736
+ {
1737
+ "epoch": 7.72,
1738
+ "eval_loss": 0.6198856830596924,
1739
+ "eval_runtime": 23.919,
1740
+ "eval_samples_per_second": 125.423,
1741
+ "eval_steps_per_second": 3.93,
1742
+ "step": 585
1743
+ },
1744
+ {
1745
+ "epoch": 7.76,
1746
+ "eval_loss": 0.6196783781051636,
1747
+ "eval_runtime": 23.9042,
1748
+ "eval_samples_per_second": 125.501,
1749
+ "eval_steps_per_second": 3.932,
1750
+ "step": 588
1751
+ },
1752
+ {
1753
+ "epoch": 7.8,
1754
+ "eval_loss": 0.6192678809165955,
1755
+ "eval_runtime": 23.9057,
1756
+ "eval_samples_per_second": 125.493,
1757
+ "eval_steps_per_second": 3.932,
1758
+ "step": 591
1759
+ },
1760
+ {
1761
+ "epoch": 7.84,
1762
+ "eval_loss": 0.6192264556884766,
1763
+ "eval_runtime": 23.909,
1764
+ "eval_samples_per_second": 125.476,
1765
+ "eval_steps_per_second": 3.932,
1766
+ "step": 594
1767
+ },
1768
+ {
1769
+ "epoch": 7.88,
1770
+ "eval_loss": 0.6192458271980286,
1771
+ "eval_runtime": 23.8917,
1772
+ "eval_samples_per_second": 125.567,
1773
+ "eval_steps_per_second": 3.934,
1774
+ "step": 597
1775
+ },
1776
+ {
1777
+ "epoch": 7.92,
1778
+ "learning_rate": 4.037685060565276e-05,
1779
+ "loss": 0.6028,
1780
+ "step": 600
1781
+ },
1782
+ {
1783
+ "epoch": 7.92,
1784
+ "eval_loss": 0.6192883849143982,
1785
+ "eval_runtime": 23.9005,
1786
+ "eval_samples_per_second": 125.521,
1787
+ "eval_steps_per_second": 3.933,
1788
+ "step": 600
1789
+ },
1790
+ {
1791
+ "epoch": 7.96,
1792
+ "eval_loss": 0.6194872856140137,
1793
+ "eval_runtime": 23.9118,
1794
+ "eval_samples_per_second": 125.461,
1795
+ "eval_steps_per_second": 3.931,
1796
+ "step": 603
1797
+ },
1798
+ {
1799
+ "epoch": 8.0,
1800
+ "eval_loss": 0.619368314743042,
1801
+ "eval_runtime": 23.9121,
1802
+ "eval_samples_per_second": 125.46,
1803
+ "eval_steps_per_second": 3.931,
1804
+ "step": 606
1805
+ },
1806
+ {
1807
+ "epoch": 8.04,
1808
+ "eval_loss": 0.6190740466117859,
1809
+ "eval_runtime": 23.9192,
1810
+ "eval_samples_per_second": 125.422,
1811
+ "eval_steps_per_second": 3.93,
1812
+ "step": 609
1813
+ },
1814
+ {
1815
+ "epoch": 8.08,
1816
+ "eval_loss": 0.6190269589424133,
1817
+ "eval_runtime": 23.916,
1818
+ "eval_samples_per_second": 125.439,
1819
+ "eval_steps_per_second": 3.93,
1820
+ "step": 612
1821
+ },
1822
+ {
1823
+ "epoch": 8.12,
1824
+ "eval_loss": 0.6189839839935303,
1825
+ "eval_runtime": 23.9063,
1826
+ "eval_samples_per_second": 125.49,
1827
+ "eval_steps_per_second": 3.932,
1828
+ "step": 615
1829
+ },
1830
+ {
1831
+ "epoch": 8.16,
1832
+ "eval_loss": 0.618523895740509,
1833
+ "eval_runtime": 23.8928,
1834
+ "eval_samples_per_second": 125.561,
1835
+ "eval_steps_per_second": 3.934,
1836
+ "step": 618
1837
+ },
1838
+ {
1839
+ "epoch": 8.18,
1840
+ "learning_rate": 3.499327052489906e-05,
1841
+ "loss": 0.5941,
1842
+ "step": 620
1843
+ },
1844
+ {
1845
+ "epoch": 8.19,
1846
+ "eval_loss": 0.6187476515769958,
1847
+ "eval_runtime": 23.9016,
1848
+ "eval_samples_per_second": 125.515,
1849
+ "eval_steps_per_second": 3.933,
1850
+ "step": 621
1851
+ },
1852
+ {
1853
+ "epoch": 8.23,
1854
+ "eval_loss": 0.6186578869819641,
1855
+ "eval_runtime": 23.9003,
1856
+ "eval_samples_per_second": 125.521,
1857
+ "eval_steps_per_second": 3.933,
1858
+ "step": 624
1859
+ },
1860
+ {
1861
+ "epoch": 8.27,
1862
+ "eval_loss": 0.6190162897109985,
1863
+ "eval_runtime": 23.9155,
1864
+ "eval_samples_per_second": 125.442,
1865
+ "eval_steps_per_second": 3.931,
1866
+ "step": 627
1867
+ },
1868
+ {
1869
+ "epoch": 8.31,
1870
+ "eval_loss": 0.6189883351325989,
1871
+ "eval_runtime": 23.9091,
1872
+ "eval_samples_per_second": 125.475,
1873
+ "eval_steps_per_second": 3.932,
1874
+ "step": 630
1875
+ },
1876
+ {
1877
+ "epoch": 8.35,
1878
+ "eval_loss": 0.6184096932411194,
1879
+ "eval_runtime": 23.9042,
1880
+ "eval_samples_per_second": 125.501,
1881
+ "eval_steps_per_second": 3.932,
1882
+ "step": 633
1883
+ },
1884
+ {
1885
+ "epoch": 8.39,
1886
+ "eval_loss": 0.6180031895637512,
1887
+ "eval_runtime": 23.9189,
1888
+ "eval_samples_per_second": 125.424,
1889
+ "eval_steps_per_second": 3.93,
1890
+ "step": 636
1891
+ },
1892
+ {
1893
+ "epoch": 8.43,
1894
+ "eval_loss": 0.6179867386817932,
1895
+ "eval_runtime": 23.922,
1896
+ "eval_samples_per_second": 125.407,
1897
+ "eval_steps_per_second": 3.929,
1898
+ "step": 639
1899
+ },
1900
+ {
1901
+ "epoch": 8.45,
1902
+ "learning_rate": 2.960969044414536e-05,
1903
+ "loss": 0.5906,
1904
+ "step": 640
1905
+ },
1906
+ {
1907
+ "epoch": 8.47,
1908
+ "eval_loss": 0.6182823777198792,
1909
+ "eval_runtime": 23.9158,
1910
+ "eval_samples_per_second": 125.44,
1911
+ "eval_steps_per_second": 3.93,
1912
+ "step": 642
1913
+ },
1914
+ {
1915
+ "epoch": 8.51,
1916
+ "eval_loss": 0.6179353594779968,
1917
+ "eval_runtime": 23.914,
1918
+ "eval_samples_per_second": 125.45,
1919
+ "eval_steps_per_second": 3.931,
1920
+ "step": 645
1921
+ },
1922
+ {
1923
+ "epoch": 8.55,
1924
+ "eval_loss": 0.6178385615348816,
1925
+ "eval_runtime": 23.9079,
1926
+ "eval_samples_per_second": 125.482,
1927
+ "eval_steps_per_second": 3.932,
1928
+ "step": 648
1929
+ },
1930
+ {
1931
+ "epoch": 8.59,
1932
+ "eval_loss": 0.6179868578910828,
1933
+ "eval_runtime": 23.9189,
1934
+ "eval_samples_per_second": 125.424,
1935
+ "eval_steps_per_second": 3.93,
1936
+ "step": 651
1937
+ },
1938
+ {
1939
+ "epoch": 8.63,
1940
+ "eval_loss": 0.6179595589637756,
1941
+ "eval_runtime": 23.9327,
1942
+ "eval_samples_per_second": 125.352,
1943
+ "eval_steps_per_second": 3.928,
1944
+ "step": 654
1945
+ },
1946
+ {
1947
+ "epoch": 8.67,
1948
+ "eval_loss": 0.6177854537963867,
1949
+ "eval_runtime": 23.9123,
1950
+ "eval_samples_per_second": 125.458,
1951
+ "eval_steps_per_second": 3.931,
1952
+ "step": 657
1953
+ },
1954
+ {
1955
+ "epoch": 8.71,
1956
+ "learning_rate": 2.422611036339166e-05,
1957
+ "loss": 0.5908,
1958
+ "step": 660
1959
+ },
1960
+ {
1961
+ "epoch": 8.71,
1962
+ "eval_loss": 0.6179735064506531,
1963
+ "eval_runtime": 23.9135,
1964
+ "eval_samples_per_second": 125.452,
1965
+ "eval_steps_per_second": 3.931,
1966
+ "step": 660
1967
+ },
1968
+ {
1969
+ "epoch": 8.75,
1970
+ "eval_loss": 0.6180996298789978,
1971
+ "eval_runtime": 23.9038,
1972
+ "eval_samples_per_second": 125.503,
1973
+ "eval_steps_per_second": 3.932,
1974
+ "step": 663
1975
+ },
1976
+ {
1977
+ "epoch": 8.79,
1978
+ "eval_loss": 0.6181532740592957,
1979
+ "eval_runtime": 23.9058,
1980
+ "eval_samples_per_second": 125.492,
1981
+ "eval_steps_per_second": 3.932,
1982
+ "step": 666
1983
+ },
1984
+ {
1985
+ "epoch": 8.83,
1986
+ "eval_loss": 0.6176265478134155,
1987
+ "eval_runtime": 23.9128,
1988
+ "eval_samples_per_second": 125.456,
1989
+ "eval_steps_per_second": 3.931,
1990
+ "step": 669
1991
+ },
1992
+ {
1993
+ "epoch": 8.87,
1994
+ "eval_loss": 0.617388904094696,
1995
+ "eval_runtime": 23.8906,
1996
+ "eval_samples_per_second": 125.572,
1997
+ "eval_steps_per_second": 3.935,
1998
+ "step": 672
1999
+ },
2000
+ {
2001
+ "epoch": 8.91,
2002
+ "eval_loss": 0.6174699068069458,
2003
+ "eval_runtime": 23.9051,
2004
+ "eval_samples_per_second": 125.496,
2005
+ "eval_steps_per_second": 3.932,
2006
+ "step": 675
2007
+ },
2008
+ {
2009
+ "epoch": 8.95,
2010
+ "eval_loss": 0.6174932718276978,
2011
+ "eval_runtime": 23.8969,
2012
+ "eval_samples_per_second": 125.54,
2013
+ "eval_steps_per_second": 3.934,
2014
+ "step": 678
2015
+ },
2016
+ {
2017
+ "epoch": 8.97,
2018
+ "learning_rate": 1.8842530282637954e-05,
2019
+ "loss": 0.5937,
2020
+ "step": 680
2021
+ },
2022
+ {
2023
+ "epoch": 8.99,
2024
+ "eval_loss": 0.6174784898757935,
2025
+ "eval_runtime": 23.9056,
2026
+ "eval_samples_per_second": 125.494,
2027
+ "eval_steps_per_second": 3.932,
2028
+ "step": 681
2029
+ },
2030
+ {
2031
+ "epoch": 9.03,
2032
+ "eval_loss": 0.6175104975700378,
2033
+ "eval_runtime": 23.9136,
2034
+ "eval_samples_per_second": 125.452,
2035
+ "eval_steps_per_second": 3.931,
2036
+ "step": 684
2037
+ },
2038
+ {
2039
+ "epoch": 9.07,
2040
+ "eval_loss": 0.6173563599586487,
2041
+ "eval_runtime": 23.9237,
2042
+ "eval_samples_per_second": 125.399,
2043
+ "eval_steps_per_second": 3.929,
2044
+ "step": 687
2045
+ },
2046
+ {
2047
+ "epoch": 9.11,
2048
+ "eval_loss": 0.6172643899917603,
2049
+ "eval_runtime": 23.9021,
2050
+ "eval_samples_per_second": 125.512,
2051
+ "eval_steps_per_second": 3.933,
2052
+ "step": 690
2053
+ },
2054
+ {
2055
+ "epoch": 9.14,
2056
+ "eval_loss": 0.6172318458557129,
2057
+ "eval_runtime": 23.9242,
2058
+ "eval_samples_per_second": 125.396,
2059
+ "eval_steps_per_second": 3.929,
2060
+ "step": 693
2061
+ },
2062
+ {
2063
+ "epoch": 9.18,
2064
+ "eval_loss": 0.617030680179596,
2065
+ "eval_runtime": 23.9184,
2066
+ "eval_samples_per_second": 125.427,
2067
+ "eval_steps_per_second": 3.93,
2068
+ "step": 696
2069
+ },
2070
+ {
2071
+ "epoch": 9.22,
2072
+ "eval_loss": 0.6169764995574951,
2073
+ "eval_runtime": 23.9104,
2074
+ "eval_samples_per_second": 125.468,
2075
+ "eval_steps_per_second": 3.931,
2076
+ "step": 699
2077
+ },
2078
+ {
2079
+ "epoch": 9.24,
2080
+ "learning_rate": 1.3458950201884254e-05,
2081
+ "loss": 0.5867,
2082
+ "step": 700
2083
+ },
2084
+ {
2085
+ "epoch": 9.26,
2086
+ "eval_loss": 0.6171083450317383,
2087
+ "eval_runtime": 23.9126,
2088
+ "eval_samples_per_second": 125.457,
2089
+ "eval_steps_per_second": 3.931,
2090
+ "step": 702
2091
+ },
2092
+ {
2093
+ "epoch": 9.3,
2094
+ "eval_loss": 0.6171473860740662,
2095
+ "eval_runtime": 23.9015,
2096
+ "eval_samples_per_second": 125.515,
2097
+ "eval_steps_per_second": 3.933,
2098
+ "step": 705
2099
+ },
2100
+ {
2101
+ "epoch": 9.34,
2102
+ "eval_loss": 0.6170982122421265,
2103
+ "eval_runtime": 23.9023,
2104
+ "eval_samples_per_second": 125.511,
2105
+ "eval_steps_per_second": 3.933,
2106
+ "step": 708
2107
+ },
2108
+ {
2109
+ "epoch": 9.38,
2110
+ "eval_loss": 0.6169420480728149,
2111
+ "eval_runtime": 23.9074,
2112
+ "eval_samples_per_second": 125.484,
2113
+ "eval_steps_per_second": 3.932,
2114
+ "step": 711
2115
+ },
2116
+ {
2117
+ "epoch": 9.42,
2118
+ "eval_loss": 0.6168730854988098,
2119
+ "eval_runtime": 23.9047,
2120
+ "eval_samples_per_second": 125.498,
2121
+ "eval_steps_per_second": 3.932,
2122
+ "step": 714
2123
+ },
2124
+ {
2125
+ "epoch": 9.46,
2126
+ "eval_loss": 0.6168663501739502,
2127
+ "eval_runtime": 23.9161,
2128
+ "eval_samples_per_second": 125.439,
2129
+ "eval_steps_per_second": 3.93,
2130
+ "step": 717
2131
+ },
2132
+ {
2133
+ "epoch": 9.5,
2134
+ "learning_rate": 8.075370121130552e-06,
2135
+ "loss": 0.5891,
2136
+ "step": 720
2137
+ },
2138
+ {
2139
+ "epoch": 9.5,
2140
+ "eval_loss": 0.6167708039283752,
2141
+ "eval_runtime": 23.9114,
2142
+ "eval_samples_per_second": 125.463,
2143
+ "eval_steps_per_second": 3.931,
2144
+ "step": 720
2145
+ },
2146
+ {
2147
+ "epoch": 9.54,
2148
+ "eval_loss": 0.6167441010475159,
2149
+ "eval_runtime": 23.9086,
2150
+ "eval_samples_per_second": 125.478,
2151
+ "eval_steps_per_second": 3.932,
2152
+ "step": 723
2153
+ },
2154
+ {
2155
+ "epoch": 9.58,
2156
+ "eval_loss": 0.6168937683105469,
2157
+ "eval_runtime": 23.9122,
2158
+ "eval_samples_per_second": 125.459,
2159
+ "eval_steps_per_second": 3.931,
2160
+ "step": 726
2161
+ },
2162
+ {
2163
+ "epoch": 9.62,
2164
+ "eval_loss": 0.6170821189880371,
2165
+ "eval_runtime": 23.9065,
2166
+ "eval_samples_per_second": 125.489,
2167
+ "eval_steps_per_second": 3.932,
2168
+ "step": 729
2169
+ },
2170
+ {
2171
+ "epoch": 9.66,
2172
+ "eval_loss": 0.6171652674674988,
2173
+ "eval_runtime": 23.9131,
2174
+ "eval_samples_per_second": 125.454,
2175
+ "eval_steps_per_second": 3.931,
2176
+ "step": 732
2177
+ },
2178
+ {
2179
+ "epoch": 9.7,
2180
+ "eval_loss": 0.6171714663505554,
2181
+ "eval_runtime": 23.9057,
2182
+ "eval_samples_per_second": 125.493,
2183
+ "eval_steps_per_second": 3.932,
2184
+ "step": 735
2185
+ },
2186
+ {
2187
+ "epoch": 9.74,
2188
+ "eval_loss": 0.6170787811279297,
2189
+ "eval_runtime": 23.928,
2190
+ "eval_samples_per_second": 125.376,
2191
+ "eval_steps_per_second": 3.928,
2192
+ "step": 738
2193
+ },
2194
+ {
2195
+ "epoch": 9.76,
2196
+ "learning_rate": 2.6917900403768505e-06,
2197
+ "loss": 0.5843,
2198
+ "step": 740
2199
+ },
2200
+ {
2201
+ "epoch": 9.78,
2202
+ "eval_loss": 0.6169885993003845,
2203
+ "eval_runtime": 23.9276,
2204
+ "eval_samples_per_second": 125.378,
2205
+ "eval_steps_per_second": 3.929,
2206
+ "step": 741
2207
+ }
2208
+ ],
2209
+ "max_steps": 750,
2210
+ "num_train_epochs": 10,
2211
+ "total_flos": 7.831063404654625e+18,
2212
+ "trial_name": null,
2213
+ "trial_params": null
2214
+ }
vicuna-7b_english-cot+auto-cot_0.0002/lora/checkpoint-741/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d5a386ce8addef927bca0e390e9534e0877a3e4e00f222f83967dd78c49527a
3
+ size 4027