krylat committed on
Commit
275bb87
1 Parent(s): 6474476

update to instruct model

Browse files
config.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "_name_or_path": "meta-llama/meta-llama-3-8B",
3
  "architectures": [
4
  "LlamaForTokenClassification"
5
  ],
6
  "attention_bias": false,
7
  "attention_dropout": 0.0,
8
  "bos_token_id": 128000,
9
- "eos_token_id": 128001,
10
  "hidden_act": "silu",
11
  "hidden_size": 4096,
12
  "id2label": {
@@ -20,6 +20,7 @@
20
  "1": 1
21
  },
22
  "max_position_embeddings": 8192,
 
23
  "model_type": "llama",
24
  "num_attention_heads": 32,
25
  "num_hidden_layers": 32,
@@ -30,7 +31,7 @@
30
  "rope_theta": 500000.0,
31
  "tie_word_embeddings": false,
32
  "torch_dtype": "bfloat16",
33
- "transformers_version": "4.40.2",
34
  "use_cache": true,
35
  "vocab_size": 128256
36
  }
 
1
  {
2
+ "_name_or_path": "meta-llama/meta-llama-3-8B-Instruct",
3
  "architectures": [
4
  "LlamaForTokenClassification"
5
  ],
6
  "attention_bias": false,
7
  "attention_dropout": 0.0,
8
  "bos_token_id": 128000,
9
+ "eos_token_id": 128009,
10
  "hidden_act": "silu",
11
  "hidden_size": 4096,
12
  "id2label": {
 
20
  "1": 1
21
  },
22
  "max_position_embeddings": 8192,
23
+ "mlp_bias": false,
24
  "model_type": "llama",
25
  "num_attention_heads": 32,
26
  "num_hidden_layers": 32,
 
31
  "rope_theta": 500000.0,
32
  "tie_word_embeddings": false,
33
  "torch_dtype": "bfloat16",
34
+ "transformers_version": "4.41.1",
35
  "use_cache": true,
36
  "vocab_size": 128256
37
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2c144103072514542e327fa8080bd375cb300f2d453fba9ca3aea81d0d4cf33
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8cf9c4d0dd972e1a2131bfe656235ee98221679711a3beef6d46dadf0f20b5c
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9eee5f23d94405d90b7e9ff88b9443fee42f8528a658f54214c2aba7530d80c
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d4782b4a69ef03845159ce1a15e272aadaaf134dc138d68f616098e8531729c
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b8fbc5e113f69768dd8de84661ea20af8a32b734a9976144b4236c447b40ccc
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3acdd690e65c24f42a24581b8467af98bd3ca357444580f8012aacd2bd607921
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7098f8a123b9401d6d5b4630c5c331204a96e41ff9a55b21d17d4c039890e23b
3
  size 117482084
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40e0d6b37568815888f7ab4c75dedaeef555e609deb2ea86901df415d70c9acb
3
  size 117482084
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f6d9c910463f9577dfa0729d98e43e952097e9029a4c1360b1284cc4aac6829
3
  size 35676
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a5397c647c43191a2dd8a57c46eabad77ab78eb886fb2155efb565d5e66c304
3
  size 35676
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1dbe76c2dc7721068d20ba0cbbc11b229d5c11383a839baef1f79fce84d9b904
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c525ef7132f585c38b4bfc5ae4f0355dfcc2530a841b09dae07ee95e86cb2de2
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b398016557f45ae11f4aaf508c18378d0be1e99dec77c80ea0fff0298f82bd0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d33b0edd5b6ac8e7325d969d8a731d29cfad089e3aa53b250b771d90d30a917
3
  size 1064
special_tokens_map.json CHANGED
@@ -7,11 +7,11 @@
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "<|end_of_text|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": "<|end_of_text|>"
17
  }
 
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "<|eot_id|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": "<|eot_id|>"
17
  }
tokenizer_config.json CHANGED
@@ -2050,14 +2050,15 @@
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
 
2053
  "clean_up_tokenization_spaces": true,
2054
- "eos_token": "<|end_of_text|>",
2055
  "model_input_names": [
2056
  "input_ids",
2057
  "attention_mask"
2058
  ],
2059
  "model_max_length": 1000000000000000019884624838656,
2060
- "pad_token": "<|end_of_text|>",
2061
  "padding": "longest",
2062
  "tokenizer_class": "PreTrainedTokenizerFast"
2063
  }
 
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
+ "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
2054
  "clean_up_tokenization_spaces": true,
2055
+ "eos_token": "<|eot_id|>",
2056
  "model_input_names": [
2057
  "input_ids",
2058
  "attention_mask"
2059
  ],
2060
  "model_max_length": 1000000000000000019884624838656,
2061
+ "pad_token": "<|eot_id|>",
2062
  "padding": "longest",
2063
  "tokenizer_class": "PreTrainedTokenizerFast"
2064
  }
trainer_state.json CHANGED
@@ -2,133 +2,186 @@
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 10.0,
5
- "eval_steps": 500,
6
- "global_step": 3380,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.4792899408284024,
13
- "grad_norm": 3.84375,
14
- "learning_rate": 8.520710059171599e-05,
15
- "loss": 0.6478,
16
- "step": 500
 
 
 
 
 
17
  },
18
  {
19
- "epoch": 1.4792899408284024,
20
- "eval_accuracy": 0.8455031356898518,
21
- "eval_f1": 0.012843946055426568,
22
- "eval_loss": 0.4867057204246521,
23
- "eval_precision": 0.010016179983049541,
24
- "eval_recall": 0.01789647577092511,
25
- "eval_runtime": 95.5701,
26
- "eval_samples_per_second": 6.278,
27
- "eval_steps_per_second": 0.785,
28
- "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  },
30
  {
31
- "epoch": 2.9585798816568047,
32
- "grad_norm": 4.09375,
33
- "learning_rate": 7.041420118343195e-05,
34
- "loss": 0.5008,
35
- "step": 1000
36
  },
37
  {
38
- "epoch": 2.9585798816568047,
39
- "eval_accuracy": 0.8540407639680729,
40
- "eval_f1": 0.016587026967649805,
41
- "eval_loss": 0.4580599069595337,
42
- "eval_precision": 0.013798775472905053,
43
- "eval_recall": 0.020787444933920703,
44
- "eval_runtime": 95.5633,
45
- "eval_samples_per_second": 6.279,
46
- "eval_steps_per_second": 0.785,
47
- "step": 1000
48
  },
49
  {
50
- "epoch": 4.437869822485207,
51
- "grad_norm": 4.125,
52
- "learning_rate": 5.562130177514793e-05,
53
- "loss": 0.4718,
54
- "step": 1500
 
 
 
 
 
55
  },
56
  {
57
- "epoch": 4.437869822485207,
58
- "eval_accuracy": 0.8557155074116306,
59
- "eval_f1": 0.01738934056007227,
60
- "eval_loss": 0.44876301288604736,
61
- "eval_precision": 0.014739663093415008,
62
- "eval_recall": 0.021200440528634363,
63
- "eval_runtime": 95.5719,
64
- "eval_samples_per_second": 6.278,
65
- "eval_steps_per_second": 0.785,
66
- "step": 1500
67
  },
68
  {
69
- "epoch": 5.9171597633136095,
70
- "grad_norm": 5.0,
71
- "learning_rate": 4.0828402366863904e-05,
72
- "loss": 0.4697,
73
- "step": 2000
 
 
 
 
 
74
  },
75
  {
76
- "epoch": 5.9171597633136095,
77
- "eval_accuracy": 0.8567702394526796,
78
- "eval_f1": 0.016934408165632433,
79
- "eval_loss": 0.44674479961395264,
80
- "eval_precision": 0.01463072452149514,
81
- "eval_recall": 0.020099118942731278,
82
- "eval_runtime": 95.576,
83
- "eval_samples_per_second": 6.278,
84
- "eval_steps_per_second": 0.785,
85
- "step": 2000
86
  },
87
  {
88
- "epoch": 7.396449704142012,
89
- "grad_norm": 5.3125,
90
- "learning_rate": 2.6035502958579882e-05,
91
- "loss": 0.4663,
92
- "step": 2500
93
  },
94
  {
95
- "epoch": 7.396449704142012,
96
- "eval_accuracy": 0.8562428734321551,
97
- "eval_f1": 0.017444150582409543,
98
- "eval_loss": 0.4421093761920929,
99
- "eval_precision": 0.01475207004853907,
100
- "eval_recall": 0.021338105726872246,
101
- "eval_runtime": 95.5992,
102
- "eval_samples_per_second": 6.276,
103
- "eval_steps_per_second": 0.785,
104
- "step": 2500
105
  },
106
  {
107
- "epoch": 8.875739644970414,
108
- "grad_norm": 6.0,
109
- "learning_rate": 1.1242603550295859e-05,
110
- "loss": 0.4614,
111
- "step": 3000
 
 
 
 
 
112
  },
113
  {
114
- "epoch": 8.875739644970414,
115
- "eval_accuracy": 0.8567559863169898,
116
- "eval_f1": 0.01658185144948253,
117
- "eval_loss": 0.4435156285762787,
118
- "eval_precision": 0.014180929095354523,
119
- "eval_recall": 0.01996145374449339,
120
- "eval_runtime": 95.7263,
121
- "eval_samples_per_second": 6.268,
122
- "eval_steps_per_second": 0.783,
123
- "step": 3000
 
 
 
 
 
 
 
124
  }
125
  ],
126
- "logging_steps": 500,
127
- "max_steps": 3380,
128
  "num_input_tokens_seen": 0,
129
  "num_train_epochs": 10,
130
  "save_steps": 500,
131
- "total_flos": 2.351623317247311e+18,
 
 
 
 
 
 
 
 
 
 
 
 
132
  "train_batch_size": 16,
133
  "trial_name": null,
134
  "trial_params": null
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 10.0,
5
+ "eval_steps": 50,
6
+ "global_step": 520,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.9615384615384616,
13
+ "eval_accuracy": 0.8201973056115414,
14
+ "eval_f1": 0.012405237767057204,
15
+ "eval_loss": 0.3012058436870575,
16
+ "eval_precision": 0.015254237288135594,
17
+ "eval_recall": 0.010452961672473868,
18
+ "eval_runtime": 2.9167,
19
+ "eval_samples_per_second": 31.542,
20
+ "eval_steps_per_second": 4.114,
21
+ "step": 50
22
  },
23
  {
24
+ "epoch": 1.9230769230769231,
25
+ "grad_norm": 1.078125,
26
+ "learning_rate": 8.076923076923078e-05,
27
+ "loss": 0.4442,
28
+ "step": 100
29
+ },
30
+ {
31
+ "epoch": 1.9230769230769231,
32
+ "eval_accuracy": 0.8290018033308582,
33
+ "eval_f1": 0.01993355481727575,
34
+ "eval_loss": 0.27755603194236755,
35
+ "eval_precision": 0.03498542274052478,
36
+ "eval_recall": 0.013937282229965157,
37
+ "eval_runtime": 2.9435,
38
+ "eval_samples_per_second": 31.255,
39
+ "eval_steps_per_second": 4.077,
40
+ "step": 100
41
+ },
42
+ {
43
+ "epoch": 2.8846153846153846,
44
+ "eval_accuracy": 0.8282592553304339,
45
+ "eval_f1": 0.020833333333333336,
46
+ "eval_loss": 0.25815218687057495,
47
+ "eval_precision": 0.03359173126614987,
48
+ "eval_recall": 0.015098722415795587,
49
+ "eval_runtime": 2.941,
50
+ "eval_samples_per_second": 31.282,
51
+ "eval_steps_per_second": 4.08,
52
+ "step": 150
53
  },
54
  {
55
+ "epoch": 3.8461538461538463,
56
+ "grad_norm": 1.171875,
57
+ "learning_rate": 6.153846153846155e-05,
58
+ "loss": 0.2901,
59
+ "step": 200
60
  },
61
  {
62
+ "epoch": 3.8461538461538463,
63
+ "eval_accuracy": 0.8308051341890315,
64
+ "eval_f1": 0.02389078498293516,
65
+ "eval_loss": 0.25513756275177,
66
+ "eval_precision": 0.04501607717041801,
67
+ "eval_recall": 0.016260162601626018,
68
+ "eval_runtime": 2.9552,
69
+ "eval_samples_per_second": 31.132,
70
+ "eval_steps_per_second": 4.061,
71
+ "step": 200
72
  },
73
  {
74
+ "epoch": 4.8076923076923075,
75
+ "eval_accuracy": 0.8308051341890315,
76
+ "eval_f1": 0.022298456260720412,
77
+ "eval_loss": 0.2511464059352875,
78
+ "eval_precision": 0.04262295081967213,
79
+ "eval_recall": 0.015098722415795587,
80
+ "eval_runtime": 2.9617,
81
+ "eval_samples_per_second": 31.064,
82
+ "eval_steps_per_second": 4.052,
83
+ "step": 250
84
  },
85
  {
86
+ "epoch": 5.769230769230769,
87
+ "grad_norm": 1.3515625,
88
+ "learning_rate": 4.230769230769231e-05,
89
+ "loss": 0.2775,
90
+ "step": 300
 
 
 
 
 
91
  },
92
  {
93
+ "epoch": 5.769230769230769,
94
+ "eval_accuracy": 0.8308051341890315,
95
+ "eval_f1": 0.02181208053691275,
96
+ "eval_loss": 0.24583899974822998,
97
+ "eval_precision": 0.03927492447129909,
98
+ "eval_recall": 0.015098722415795587,
99
+ "eval_runtime": 2.9496,
100
+ "eval_samples_per_second": 31.19,
101
+ "eval_steps_per_second": 4.068,
102
+ "step": 300
103
  },
104
  {
105
+ "epoch": 6.730769230769231,
106
+ "eval_accuracy": 0.8309112124748064,
107
+ "eval_f1": 0.020477815699658706,
108
+ "eval_loss": 0.24579653143882751,
109
+ "eval_precision": 0.03858520900321544,
110
+ "eval_recall": 0.013937282229965157,
111
+ "eval_runtime": 2.9607,
112
+ "eval_samples_per_second": 31.074,
113
+ "eval_steps_per_second": 4.053,
114
+ "step": 350
115
  },
116
  {
117
+ "epoch": 7.6923076923076925,
118
+ "grad_norm": 1.5625,
119
+ "learning_rate": 2.307692307692308e-05,
120
+ "loss": 0.2677,
121
+ "step": 400
122
  },
123
  {
124
+ "epoch": 7.6923076923076925,
125
+ "eval_accuracy": 0.8305929776174817,
126
+ "eval_f1": 0.020168067226890754,
127
+ "eval_loss": 0.24363110959529877,
128
+ "eval_precision": 0.0364741641337386,
129
+ "eval_recall": 0.013937282229965157,
130
+ "eval_runtime": 2.955,
131
+ "eval_samples_per_second": 31.133,
132
+ "eval_steps_per_second": 4.061,
133
+ "step": 400
134
  },
135
  {
136
+ "epoch": 8.653846153846153,
137
+ "eval_accuracy": 0.8306990559032567,
138
+ "eval_f1": 0.020202020202020204,
139
+ "eval_loss": 0.24341881275177002,
140
+ "eval_precision": 0.03669724770642202,
141
+ "eval_recall": 0.013937282229965157,
142
+ "eval_runtime": 2.9545,
143
+ "eval_samples_per_second": 31.138,
144
+ "eval_steps_per_second": 4.062,
145
+ "step": 450
146
  },
147
  {
148
+ "epoch": 9.615384615384615,
149
+ "grad_norm": 1.9453125,
150
+ "learning_rate": 3.846153846153847e-06,
151
+ "loss": 0.2664,
152
+ "step": 500
153
+ },
154
+ {
155
+ "epoch": 9.615384615384615,
156
+ "eval_accuracy": 0.8309112124748064,
157
+ "eval_f1": 0.02027027027027027,
158
+ "eval_loss": 0.24354620277881622,
159
+ "eval_precision": 0.03715170278637771,
160
+ "eval_recall": 0.013937282229965157,
161
+ "eval_runtime": 3.1385,
162
+ "eval_samples_per_second": 29.313,
163
+ "eval_steps_per_second": 3.823,
164
+ "step": 500
165
  }
166
  ],
167
+ "logging_steps": 100,
168
+ "max_steps": 520,
169
  "num_input_tokens_seen": 0,
170
  "num_train_epochs": 10,
171
  "save_steps": 500,
172
+ "stateful_callbacks": {
173
+ "TrainerControl": {
174
+ "args": {
175
+ "should_epoch_stop": false,
176
+ "should_evaluate": false,
177
+ "should_log": false,
178
+ "should_save": true,
179
+ "should_training_stop": true
180
+ },
181
+ "attributes": {}
182
+ }
183
+ },
184
+ "total_flos": 4.476838693139221e+16,
185
  "train_batch_size": 16,
186
  "trial_name": null,
187
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c62084dc75fa1981b3f4998b0236d32b0af168ce250dbc7124b6db0630753d05
3
- size 4984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1e2a0fefa001714bb24115f77edcee86e4a522f49c3a2ee60ab00018a3414ec
3
+ size 5112