ChenWu98 committed on
Commit 65e2720
1 Parent(s): bd18f0b

Model save

Files changed (5):
  1. README.md +5 -13
  2. all_results.json +9 -9
  3. eval_results.json +5 -5
  4. train_results.json +5 -5
  5. trainer_state.json +27 -67
README.md CHANGED
@@ -2,15 +2,9 @@
 license: mit
 library_name: peft
 tags:
-- alignment-handbook
 - trl
 - sft
 - generated_from_trainer
-- trl
-- sft
-- generated_from_trainer
-datasets:
-- ChenWu98/skills_red_herring_metaphor_chat
 base_model: HuggingFaceH4/zephyr-7b-beta
 model-index:
 - name: skills_red_herring_metaphor_chat-lora
@@ -22,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
 
 # skills_red_herring_metaphor_chat-lora
 
-This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the ChenWu98/skills_red_herring_metaphor_chat dataset.
+This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.2900
+- Loss: 0.3410
 
 ## Model description
 
@@ -53,16 +47,14 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
-- num_epochs: 4.0
+- num_epochs: 2.0
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 1.8767        | 0.96  | 9    | 0.4226          |
-| 0.3349        | 1.92  | 18   | 0.3096          |
-| 0.2678        | 2.99  | 28   | 0.2913          |
-| 0.2538        | 3.84  | 36   | 0.2900          |
+| 1.6966        | 0.96  | 9    | 0.4003          |
+| 0.3419        | 1.92  | 18   | 0.3410          |
 
 
 ### Framework versions
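The card describes a LoRA adapter (library_name: peft) trained on top of zephyr-7b-beta. A minimal loading sketch for inference; the repo id below is an assumption taken from the model-index name, not confirmed by the diff:

```python
# Minimal sketch: attach the LoRA adapter to its zephyr-7b-beta base model.
# The adapter repo id is assumed from the model-index name in README.md.
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

adapter_id = "ChenWu98/skills_red_herring_metaphor_chat-lora"  # assumed repo id

# AutoPeftModelForCausalLM reads the adapter config, downloads the base model
# it points to (HuggingFaceH4/zephyr-7b-beta), and attaches the LoRA weights.
model = AutoPeftModelForCausalLM.from_pretrained(adapter_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

# Zephyr ships a chat template, so format the prompt through it.
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Explain what a red herring is."}],
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```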
all_results.json CHANGED
@@ -1,13 +1,13 @@
 {
-    "epoch": 3.84,
-    "eval_loss": 0.2900236248970032,
-    "eval_runtime": 9.9443,
+    "epoch": 1.92,
+    "eval_loss": 0.3410404324531555,
+    "eval_runtime": 5.4206,
     "eval_samples": 100,
-    "eval_samples_per_second": 10.056,
-    "eval_steps_per_second": 1.307,
-    "train_loss": 0.5664321829875311,
-    "train_runtime": 739.8338,
+    "eval_samples_per_second": 18.448,
+    "eval_steps_per_second": 2.398,
+    "train_loss": 0.8035296764638689,
+    "train_runtime": 392.0592,
     "train_samples": 300,
-    "train_samples_per_second": 1.622,
-    "train_steps_per_second": 0.049
+    "train_samples_per_second": 1.53,
+    "train_steps_per_second": 0.046
 }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 3.84,
-    "eval_loss": 0.2900236248970032,
-    "eval_runtime": 9.9443,
+    "epoch": 1.92,
+    "eval_loss": 0.3410404324531555,
+    "eval_runtime": 5.4206,
     "eval_samples": 100,
-    "eval_samples_per_second": 10.056,
-    "eval_steps_per_second": 1.307
+    "eval_samples_per_second": 18.448,
+    "eval_steps_per_second": 2.398
 }
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 3.84,
-    "train_loss": 0.5664321829875311,
-    "train_runtime": 739.8338,
+    "epoch": 1.92,
+    "train_loss": 0.8035296764638689,
+    "train_runtime": 392.0592,
     "train_samples": 300,
-    "train_samples_per_second": 1.622,
-    "train_steps_per_second": 0.049
+    "train_samples_per_second": 1.53,
+    "train_steps_per_second": 0.046
 }
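The throughput fields in these result files are derivable from the sample counts and runtimes, so the new values can be sanity-checked. A quick sketch, assuming the files are read from the trainer's output directory:

```python
import json

# Cross-check the derived throughput fields in the updated results.
# The path is an assumption (run from the trainer's output directory).
with open("all_results.json") as f:
    r = json.load(f)

# eval throughput: 100 samples / 5.4206 s = 18.448 samples/s (3-decimal rounding)
assert round(r["eval_samples"] / r["eval_runtime"], 3) == r["eval_samples_per_second"]

# train throughput: 300 samples x 2 epochs / 392.0592 s = 1.53 samples/s,
# and 18 optimizer steps / 392.0592 s = 0.046 steps/s
epochs, steps = 2, 18
print(round(r["train_samples"] * epochs / r["train_runtime"], 2))  # 1.53
print(round(steps / r["train_runtime"], 3))                        # 0.046
```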
trainer_state.json CHANGED
@@ -1,109 +1,69 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 3.84,
+  "epoch": 1.92,
   "eval_steps": 500,
-  "global_step": 36,
+  "global_step": 18,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.11,
-      "learning_rate": 5e-05,
+      "learning_rate": 0.0001,
       "loss": 2.1556,
       "step": 1
     },
     {
       "epoch": 0.53,
-      "learning_rate": 0.0001995184726672197,
-      "loss": 1.8767,
+      "learning_rate": 0.00018314696123025454,
+      "loss": 1.6966,
       "step": 5
     },
     {
       "epoch": 0.96,
-      "eval_loss": 0.4225959777832031,
-      "eval_runtime": 8.0339,
-      "eval_samples_per_second": 12.447,
-      "eval_steps_per_second": 1.618,
+      "eval_loss": 0.4002748727798462,
+      "eval_runtime": 6.1602,
+      "eval_samples_per_second": 16.233,
+      "eval_steps_per_second": 2.11,
       "step": 9
     },
     {
       "epoch": 1.07,
-      "learning_rate": 0.00018314696123025454,
-      "loss": 0.685,
+      "learning_rate": 0.0001,
+      "loss": 0.5695,
       "step": 10
     },
     {
       "epoch": 1.6,
-      "learning_rate": 0.0001471396736825998,
-      "loss": 0.3349,
+      "learning_rate": 1.6853038769745467e-05,
+      "loss": 0.3419,
       "step": 15
     },
     {
       "epoch": 1.92,
-      "eval_loss": 0.30958235263824463,
-      "eval_runtime": 4.9729,
-      "eval_samples_per_second": 20.109,
-      "eval_steps_per_second": 2.614,
+      "eval_loss": 0.3410404324531555,
+      "eval_runtime": 7.1037,
+      "eval_samples_per_second": 14.077,
+      "eval_steps_per_second": 1.83,
       "step": 18
     },
     {
-      "epoch": 2.13,
-      "learning_rate": 0.0001,
-      "loss": 0.2955,
-      "step": 20
-    },
-    {
-      "epoch": 2.67,
-      "learning_rate": 5.286032631740023e-05,
-      "loss": 0.2678,
-      "step": 25
-    },
-    {
-      "epoch": 2.99,
-      "eval_loss": 0.2913205325603485,
-      "eval_runtime": 8.0251,
-      "eval_samples_per_second": 12.461,
-      "eval_steps_per_second": 1.62,
-      "step": 28
-    },
-    {
-      "epoch": 3.2,
-      "learning_rate": 1.6853038769745467e-05,
-      "loss": 0.2584,
-      "step": 30
-    },
-    {
-      "epoch": 3.73,
-      "learning_rate": 4.815273327803182e-07,
-      "loss": 0.2538,
-      "step": 35
-    },
-    {
-      "epoch": 3.84,
-      "eval_loss": 0.2900236248970032,
-      "eval_runtime": 5.7265,
-      "eval_samples_per_second": 17.463,
-      "eval_steps_per_second": 2.27,
-      "step": 36
-    },
-    {
-      "epoch": 3.84,
-      "step": 36,
-      "total_flos": 55410626723840.0,
-      "train_loss": 0.5664321829875311,
-      "train_runtime": 739.8338,
-      "train_samples_per_second": 1.622,
-      "train_steps_per_second": 0.049
+      "epoch": 1.92,
+      "step": 18,
+      "total_flos": 27578625622016.0,
+      "train_loss": 0.8035296764638689,
+      "train_runtime": 392.0592,
+      "train_samples_per_second": 1.53,
+      "train_steps_per_second": 0.046
     }
   ],
   "logging_steps": 5,
-  "max_steps": 36,
+  "max_steps": 18,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 4,
+  "num_train_epochs": 2,
   "save_steps": 500,
-  "total_flos": 55410626723840.0,
+  "total_flos": 27578625622016.0,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null