pkarypis committed on
Commit
7a975b9
1 Parent(s): 141d5b3

Model save

README.md CHANGED
@@ -2,15 +2,11 @@
 license: apache-2.0
 base_model: JackFram/llama-68m
 tags:
-- alignment-handbook
-- trl
-- sft
-- generated_from_trainer
 - trl
 - sft
 - generated_from_trainer
 datasets:
-- GAIR/lima
+- generator
 model-index:
 - name: gpt2-sft-lima
   results: []
@@ -21,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
 
 # gpt2-sft-lima
 
-This model is a fine-tuned version of [JackFram/llama-68m](https://huggingface.co/JackFram/llama-68m) on the GAIR/lima dataset.
+This model is a fine-tuned version of [JackFram/llama-68m](https://huggingface.co/JackFram/llama-68m) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 5.4846
+- Loss: 5.5083
 
 ## Model description
 
@@ -46,6 +42,10 @@ The following hyperparameters were used during training:
 - train_batch_size: 32
 - eval_batch_size: 8
 - seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- total_train_batch_size: 128
+- total_eval_batch_size: 32
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
@@ -55,13 +55,13 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 3.4931 | 1.0 | 24 | 5.5962 |
-| 3.4278 | 2.0 | 48 | 5.4846 |
+| 3.6492 | 1.0 | 6 | 5.7124 |
+| 3.457 | 2.0 | 12 | 5.5083 |
 
 
 ### Framework versions
 
-- Transformers 4.40.1
+- Transformers 4.38.2
 - Pytorch 2.1.2
 - Datasets 2.14.6
-- Tokenizers 0.19.1
+- Tokenizers 0.15.2
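The card above describes an SFT checkpoint of JackFram/llama-68m. As a quick way to try it, a minimal loading sketch follows. The repo id `pkarypis/gpt2-sft-lima` is an assumption inferred from the committer name and the model-index name, not something stated in this diff; despite the gpt2-prefixed name, the base model is a Llama architecture, so the standard causal-LM auto classes apply.

```python
# Minimal sketch: load the fine-tuned checkpoint for generation.
# "pkarypis/gpt2-sft-lima" is a hypothetical repo id; substitute the real one.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "pkarypis/gpt2-sft-lima"  # assumption, not stated in the diff
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)

inputs = tokenizer("Give three tips for staying healthy.", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```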
all_results.json CHANGED
@@ -1,14 +1,8 @@
 {
   "epoch": 2.0,
-  "eval_loss": 5.484647274017334,
-  "eval_runtime": 0.045,
-  "eval_samples": 300,
-  "eval_samples_per_second": 266.847,
-  "eval_steps_per_second": 44.475,
-  "total_flos": 394600362541056.0,
-  "train_loss": 3.4869212259848914,
-  "train_runtime": 14.4504,
+  "train_loss": 3.5401106079419455,
+  "train_runtime": 10.4273,
   "train_samples": 1030,
-  "train_samples_per_second": 102.281,
-  "train_steps_per_second": 3.322
+  "train_samples_per_second": 141.744,
+  "train_steps_per_second": 1.151
 }
config.json CHANGED
@@ -23,7 +23,7 @@
   "rope_theta": 10000.0,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.40.1",
-  "use_cache": true,
+  "transformers_version": "4.38.2",
+  "use_cache": false,
   "vocab_size": 32000
 }
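Note the flip to `"use_cache": false`, which is commonly the result of training with gradient checkpointing enabled (transformers disables the KV cache in that case). For inference you generally want the cache back on. A hedged sketch, reusing the hypothetical repo id from above:

```python
# Sketch: re-enable the KV cache at inference time for a checkpoint saved
# with "use_cache": false. Caching makes autoregressive generate() faster.
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "pkarypis/gpt2-sft-lima"  # hypothetical repo id, as above
config = AutoConfig.from_pretrained(repo_id)
config.use_cache = True
model = AutoModelForCausalLM.from_pretrained(repo_id, config=config)
```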
generation_config.json CHANGED
@@ -3,5 +3,5 @@
   "bos_token_id": 0,
   "eos_token_id": 2,
   "pad_token_id": 1,
-  "transformers_version": "4.40.1"
+  "transformers_version": "4.38.2"
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d8806bf32cff7ec1911f05503bac5b2c0db8784dc06f7b1ec97ab24b7c4851c2
+oid sha256:08f0b8f3e809f4bed4446ca5ad5539a8088deadbf434fda96d9dee7d88c1749f
 size 136062744
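The weights are stored through Git LFS, so the diff only shows the pointer: the object's sha256 oid and its byte size. Those two fields are enough to verify a download locally; a small sketch using values copied from the new pointer:

```python
# Sketch: verify a downloaded model.safetensors against the LFS pointer above.
import hashlib

EXPECTED_OID = "08f0b8f3e809f4bed4446ca5ad5539a8088deadbf434fda96d9dee7d88c1749f"
EXPECTED_SIZE = 136_062_744

sha, size = hashlib.sha256(), 0
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash 1 MiB at a time
        sha.update(chunk)
        size += len(chunk)

assert size == EXPECTED_SIZE, "size does not match the LFS pointer"
assert sha.hexdigest() == EXPECTED_OID, "sha256 does not match the LFS pointer"
print("model.safetensors matches its LFS pointer")
```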
runs/Apr25_15-24-23_aga39/events.out.tfevents.1714076671.aga39.828093.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df9530b3fa9dddfd0c29be52b1d8862925277dea587a847edc911e8c1f0a6b57
+size 6203
tokenizer.json CHANGED
@@ -134,7 +134,6 @@
   "end_of_word_suffix": null,
   "fuse_unk": true,
   "byte_fallback": true,
-  "ignore_merges": false,
   "vocab": {
     "<unk>": 0,
     "<s>": 1,
train_results.json CHANGED
@@ -1,9 +1,8 @@
 {
   "epoch": 2.0,
-  "total_flos": 394600362541056.0,
-  "train_loss": 3.4869212259848914,
-  "train_runtime": 14.4504,
+  "train_loss": 3.5401106079419455,
+  "train_runtime": 10.4273,
   "train_samples": 1030,
-  "train_samples_per_second": 102.281,
-  "train_steps_per_second": 3.322
+  "train_samples_per_second": 141.744,
+  "train_steps_per_second": 1.151
 }
trainer_state.json CHANGED
@@ -3,113 +3,64 @@
   "best_model_checkpoint": null,
   "epoch": 2.0,
   "eval_steps": 500,
-  "global_step": 48,
+  "global_step": 12,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.041666666666666664,
-      "grad_norm": 6.875,
-      "learning_rate": 4.000000000000001e-06,
-      "loss": 3.886,
+      "epoch": 0.17,
+      "grad_norm": 5.802258301213437,
+      "learning_rate": 1e-05,
+      "loss": 3.714,
       "step": 1
     },
     {
-      "epoch": 0.20833333333333334,
-      "grad_norm": 5.15625,
-      "learning_rate": 2e-05,
-      "loss": 3.5195,
+      "epoch": 0.83,
+      "grad_norm": 4.228032304421419,
+      "learning_rate": 1.5877852522924733e-05,
+      "loss": 3.6492,
       "step": 5
     },
-    {
-      "epoch": 0.4166666666666667,
-      "grad_norm": 3.546875,
-      "learning_rate": 1.9340161087325483e-05,
-      "loss": 3.6848,
-      "step": 10
-    },
-    {
-      "epoch": 0.625,
-      "grad_norm": 3.0625,
-      "learning_rate": 1.744772182743782e-05,
-      "loss": 3.5055,
-      "step": 15
-    },
-    {
-      "epoch": 0.8333333333333334,
-      "grad_norm": 3.0,
-      "learning_rate": 1.4572423233046386e-05,
-      "loss": 3.4931,
-      "step": 20
-    },
     {
       "epoch": 1.0,
-      "eval_loss": 5.596227169036865,
-      "eval_runtime": 0.0423,
-      "eval_samples_per_second": 283.93,
-      "eval_steps_per_second": 47.322,
-      "step": 24
-    },
-    {
-      "epoch": 1.0416666666666667,
-      "grad_norm": 2.90625,
-      "learning_rate": 1.1093712083778748e-05,
-      "loss": 3.4722,
-      "step": 25
-    },
-    {
-      "epoch": 1.25,
-      "grad_norm": 2.828125,
-      "learning_rate": 7.470666176083193e-06,
-      "loss": 3.4108,
-      "step": 30
-    },
-    {
-      "epoch": 1.4583333333333333,
-      "grad_norm": 2.5,
-      "learning_rate": 4.181410844420473e-06,
-      "loss": 3.4187,
-      "step": 35
-    },
-    {
-      "epoch": 1.6666666666666665,
-      "grad_norm": 2.8125,
-      "learning_rate": 1.660021821101222e-06,
-      "loss": 3.4852,
-      "step": 40
+      "eval_loss": 5.712366104125977,
+      "eval_runtime": 0.0483,
+      "eval_samples_per_second": 248.688,
+      "eval_steps_per_second": 20.724,
+      "step": 6
     },
     {
-      "epoch": 1.875,
-      "grad_norm": 2.484375,
-      "learning_rate": 2.392412244407294e-07,
-      "loss": 3.4278,
-      "step": 45
+      "epoch": 1.67,
+      "grad_norm": 2.84117612385776,
+      "learning_rate": 1.9098300562505266e-06,
+      "loss": 3.457,
+      "step": 10
     },
     {
       "epoch": 2.0,
-      "eval_loss": 5.484647274017334,
-      "eval_runtime": 0.0423,
-      "eval_samples_per_second": 283.999,
-      "eval_steps_per_second": 47.333,
-      "step": 48
+      "eval_loss": 5.508264064788818,
+      "eval_runtime": 0.0486,
+      "eval_samples_per_second": 247.068,
+      "eval_steps_per_second": 20.589,
+      "step": 12
     },
     {
       "epoch": 2.0,
-      "step": 48,
-      "total_flos": 394600362541056.0,
-      "train_loss": 3.4869212259848914,
-      "train_runtime": 14.4504,
-      "train_samples_per_second": 102.281,
-      "train_steps_per_second": 3.322
+      "step": 12,
+      "total_flos": 36238786560.0,
+      "train_loss": 3.5401106079419455,
+      "train_runtime": 10.4273,
+      "train_samples_per_second": 141.744,
+      "train_steps_per_second": 1.151
     }
   ],
   "logging_steps": 5,
-  "max_steps": 48,
+  "max_steps": 12,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
   "save_steps": 1000000000,
-  "total_flos": 394600362541056.0,
+  "total_flos": 36238786560.0,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a2c9ebcd5bf35bdc52b59db2bc7861d16777727aab05faf5f6fe289107775724
-size 5048
+oid sha256:43e697c2bae2940ee6c5029cb760ba9a42f675be9a07073967082c9baeb458d4
+size 6072
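`training_args.bin` is a pickled `TrainingArguments` object rather than a tensor file (the size change here likely reflects fields added or removed between transformers versions). It can be inspected directly to confirm the hyperparameters listed in the card; a sketch, for files you trust:

```python
# Sketch: inspect the pickled TrainingArguments stored in training_args.bin.
# On newer PyTorch pass weights_only=False, since this file holds a pickled
# Python object rather than tensors; only unpickle files you trust.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(args.per_device_train_batch_size)  # 32, per the card
print(args.lr_scheduler_type)            # cosine
print(args.warmup_ratio)                 # 0.1
```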