sedrickkeh committed on
Commit a371b3f
1 Parent(s): 6b82c31

End of training

README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3
 base_model: meta-llama/Meta-Llama-3-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: model
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # model
 
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on an unknown dataset.
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the PrincetonPLI/Instruct-SkillMix-SDD dataset.
 It achieves the following results on the evaluation set:
 - Loss: nan
 
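The updated card names only the base model and the dataset, so as a quick orientation here is a minimal usage sketch. The repo id `sedrickkeh/model` is a placeholder inferred from the committer name and the model-index name above, not a confirmed Hub path, and the prompt and generation settings are purely illustrative.

```python
# Hedged sketch: loading the fine-tuned checkpoint described in the card above.
# The repo id below is a placeholder -- the card only calls the model "model";
# substitute the actual Hub path of this repository before running.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "sedrickkeh/model"  # placeholder, not confirmed by the card
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype="auto", device_map="auto")

inputs = tokenizer("Give three tips for writing clear commit messages.", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```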
all_results.json CHANGED
@@ -1,12 +1,12 @@
 {
     "epoch": 2.4,
     "eval_loss": NaN,
-    "eval_runtime": 2.195,
-    "eval_samples_per_second": 46.014,
-    "eval_steps_per_second": 0.911,
-    "total_flos": 4.249521430997238e+17,
-    "train_loss": 51.85570949978299,
-    "train_runtime": 609.9474,
-    "train_samples_per_second": 9.439,
-    "train_steps_per_second": 0.015
+    "eval_runtime": 2.0141,
+    "eval_samples_per_second": 50.146,
+    "eval_steps_per_second": 0.993,
+    "total_flos": 4.249521302148219e+17,
+    "train_loss": 0.606980217827691,
+    "train_runtime": 621.1198,
+    "train_samples_per_second": 9.269,
+    "train_steps_per_second": 0.014
 }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
 {
     "epoch": 2.4,
     "eval_loss": NaN,
-    "eval_runtime": 2.195,
-    "eval_samples_per_second": 46.014,
-    "eval_steps_per_second": 0.911
+    "eval_runtime": 2.0141,
+    "eval_samples_per_second": 50.146,
+    "eval_steps_per_second": 0.993
 }
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "epoch": 2.4,
-    "total_flos": 4.249521430997238e+17,
-    "train_loss": 51.85570949978299,
-    "train_runtime": 609.9474,
-    "train_samples_per_second": 9.439,
-    "train_steps_per_second": 0.015
+    "total_flos": 4.249521302148219e+17,
+    "train_loss": 0.606980217827691,
+    "train_runtime": 621.1198,
+    "train_samples_per_second": 9.269,
+    "train_steps_per_second": 0.014
 }
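In this commit, all_results.json appears to be simply the union of eval_results.json and train_results.json. A small sketch that checks this locally, assuming the three files sit in the working directory:

```python
# Sketch: verify that all_results.json matches the merged eval/train results.
# Assumes the three JSON files from this commit are in the current directory.
import json
import math

def load(path):
    with open(path) as f:
        return json.load(f)  # json parses NaN literals into float("nan")

merged = {**load("eval_results.json"), **load("train_results.json")}
all_results = load("all_results.json")

assert merged.keys() == all_results.keys()
for key in merged:
    a, b = merged[key], all_results[key]
    # NaN != NaN, so treat two NaNs as equal explicitly.
    same = a == b or (isinstance(a, float) and math.isnan(a) and math.isnan(b))
    assert same, key
print("all_results.json matches the merged eval/train results")
```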
trainer_state.json CHANGED
@@ -8,41 +8,104 @@
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 69.07160949707031,
+      "learning_rate": 5e-06,
+      "loss": 1.9082,
+      "step": 1
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 28.931533813476562,
+      "learning_rate": 5e-06,
+      "loss": 1.829,
+      "step": 2
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": NaN,
+      "learning_rate": 5e-06,
+      "loss": 1.7257,
+      "step": 3
+    },
     {
       "epoch": 0.8,
       "eval_loss": NaN,
-      "eval_runtime": 2.4966,
-      "eval_samples_per_second": 40.454,
-      "eval_steps_per_second": 0.801,
+      "eval_runtime": 2.4891,
+      "eval_samples_per_second": 40.577,
+      "eval_steps_per_second": 0.804,
       "step": 3
     },
+    {
+      "epoch": 1.0666666666666667,
+      "grad_norm": NaN,
+      "learning_rate": 5e-06,
+      "loss": 0.0,
+      "step": 4
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": NaN,
+      "learning_rate": 5e-06,
+      "loss": 0.0,
+      "step": 5
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": NaN,
+      "learning_rate": 5e-06,
+      "loss": 0.0,
+      "step": 6
+    },
+    {
+      "epoch": 1.8666666666666667,
+      "grad_norm": NaN,
+      "learning_rate": 5e-06,
+      "loss": 0.0,
+      "step": 7
+    },
     {
       "epoch": 1.8666666666666667,
       "eval_loss": NaN,
-      "eval_runtime": 2.4813,
-      "eval_samples_per_second": 40.705,
-      "eval_steps_per_second": 0.806,
+      "eval_runtime": 2.4704,
+      "eval_samples_per_second": 40.884,
+      "eval_steps_per_second": 0.81,
       "step": 7
     },
+    {
+      "epoch": 2.1333333333333333,
+      "grad_norm": NaN,
+      "learning_rate": 5e-06,
+      "loss": 0.0,
+      "step": 8
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": NaN,
+      "learning_rate": 5e-06,
+      "loss": 0.0,
+      "step": 9
+    },
     {
       "epoch": 2.4,
       "eval_loss": NaN,
-      "eval_runtime": 1.9517,
-      "eval_samples_per_second": 51.75,
-      "eval_steps_per_second": 1.025,
+      "eval_runtime": 1.9225,
+      "eval_samples_per_second": 52.537,
+      "eval_steps_per_second": 1.04,
       "step": 9
     },
     {
       "epoch": 2.4,
       "step": 9,
-      "total_flos": 4.249521430997238e+17,
-      "train_loss": 51.85570949978299,
-      "train_runtime": 609.9474,
-      "train_samples_per_second": 9.439,
-      "train_steps_per_second": 0.015
+      "total_flos": 4.249521302148219e+17,
+      "train_loss": 0.606980217827691,
+      "train_runtime": 621.1198,
+      "train_samples_per_second": 9.269,
+      "train_steps_per_second": 0.014
     }
   ],
-  "logging_steps": 10,
+  "logging_steps": 1.0,
   "max_steps": 9,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
@@ -59,7 +122,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.249521430997238e+17,
+  "total_flos": 4.249521302148219e+17,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null
training_loss.png ADDED
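The added training_loss.png itself is not shown here, but a similar loss-per-step curve can be rebuilt from the log_history entries in trainer_state.json. The sketch below assumes the file is in the working directory and that matplotlib is installed; the output file name is arbitrary.

```python
# Sketch: rebuild a training-loss curve from trainer_state.json's log_history.
# Entries without a "loss" key (the eval and final summary records) are skipped.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

train_logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]

plt.plot(steps, losses, marker="o")
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("training loss per logging step")
plt.savefig("training_loss_reproduced.png")
```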