Wenda Xu committed on
Commit
338afc4
1 Parent(s): 2825b5a

updates all files

Browse files
InstructScore.py CHANGED
@@ -12,6 +12,7 @@ MAX_TARGET_LENGTH = 512
12
  print("Max source length: ", MAX_SOURCE_LENGTH)
13
  print("MAX target length: ", MAX_TARGET_LENGTH)
14
 
 
15
  def smart_tokenizer_and_embedding_resize(
16
  special_tokens_dict: Dict,
17
  tokenizer: transformers.PreTrainedTokenizer,
@@ -28,14 +29,14 @@ def smart_tokenizer_and_embedding_resize(
28
  }
29
  )
30
 
31
- device_id = (
32
- torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
33
- )
34
 
35
  class InstructScore:
36
- def __init__(self):
37
  self.tokenizer = LlamaTokenizer.from_pretrained(
38
- "InstructScore_Tok", model_max_length=MAX_SOURCE_LENGTH, use_fast=False
39
  )
40
  # enable batch inference by left padding
41
  self.tokenizer.padding_side = "left"
@@ -44,12 +45,17 @@ class InstructScore:
44
  special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
45
  tokenizer=self.tokenizer,
46
  )
47
- self.model = LlamaForCausalLM.from_pretrained('InstructScore_English').to(device_id)
 
 
48
  self.model.eval()
 
49
  def score(self, ref_ls, out_ls):
50
- prompt_ls=\
51
- [f"You are evaluating Chinese-to-English Machine translation task. The correct translation is \"{ref}\". The model generated translation is \"{out}\". Please identify all errors within each model output, up to a maximum of five. For each error, please give me the corresponding error type, major/minor label, error location of the model generated translation and explanation for the error. Major errors can confuse or mislead the reader due to significant change in meaning, while minor\
52
- errors don't lead to loss of meaning but will be noticed." for ref, out in zip(ref_ls, out_ls)]
 
 
53
 
54
  with torch.no_grad():
55
  inputs = self.tokenizer(
@@ -69,17 +75,29 @@ class InstructScore:
69
  skip_special_tokens=True,
70
  clean_up_tokenization_spaces=True,
71
  )
72
- scores_ls = [(-1) * output.count("Major/minor: Minor") + (-5) * output.count("Major/minor: Major") for output in batch_outputs]
 
 
 
 
73
  return batch_outputs, scores_ls
74
 
 
75
  def main():
76
- refs = ["SEScore is a simple but effective next generation text generation evaluation metric", "SEScore it really works"]
77
- outs = ["SEScore is a simple effective text evaluation metric for next generation", "SEScore is not working"]
78
-
 
 
 
 
 
 
79
  scorer = InstructScore()
80
  batch_outputs, scores_ls = scorer.score(refs, outs)
81
  print(batch_outputs)
82
  print(scores_ls)
83
 
 
84
  if __name__ == "__main__":
85
  main()
 
12
  print("Max source length: ", MAX_SOURCE_LENGTH)
13
  print("MAX target length: ", MAX_TARGET_LENGTH)
14
 
15
+
16
  def smart_tokenizer_and_embedding_resize(
17
  special_tokens_dict: Dict,
18
  tokenizer: transformers.PreTrainedTokenizer,
 
29
  }
30
  )
31
 
32
+
33
+ device_id = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
34
+
35
 
36
  class InstructScore:
37
+ def __init__(self):
38
  self.tokenizer = LlamaTokenizer.from_pretrained(
39
+ "xu1998hz/InstructScore", model_max_length=MAX_SOURCE_LENGTH, use_fast=False
40
  )
41
  # enable batch inference by left padding
42
  self.tokenizer.padding_side = "left"
 
45
  special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
46
  tokenizer=self.tokenizer,
47
  )
48
+ self.model = LlamaForCausalLM.from_pretrained("xu1998hz/InstructScore").to(
49
+ device_id
50
+ )
51
  self.model.eval()
52
+
53
  def score(self, ref_ls, out_ls):
54
+ prompt_ls = [
55
+ f'You are evaluating Chinese-to-English Machine translation task. The correct translation is "{ref}". The model generated translation is "{out}". Please identify all errors within each model output, up to a maximum of five. For each error, please give me the corresponding error type, major/minor label, error location of the model generated translation and explanation for the error. Major errors can confuse or mislead the reader due to significant change in meaning, while minor\
56
+ errors don\'t lead to loss of meaning but will be noticed.'
57
+ for ref, out in zip(ref_ls, out_ls)
58
+ ]
59
 
60
  with torch.no_grad():
61
  inputs = self.tokenizer(
 
75
  skip_special_tokens=True,
76
  clean_up_tokenization_spaces=True,
77
  )
78
+ scores_ls = [
79
+ (-1) * output.count("Major/minor: Minor")
80
+ + (-5) * output.count("Major/minor: Major")
81
+ for output in batch_outputs
82
+ ]
83
  return batch_outputs, scores_ls
84
 
85
+
86
  def main():
87
+ refs = [
88
+ "SEScore is a simple but effective next generation text generation evaluation metric",
89
+ "SEScore it really works",
90
+ ]
91
+ outs = [
92
+ "SEScore is a simple effective text evaluation metric for next generation",
93
+ "SEScore is not working",
94
+ ]
95
+
96
  scorer = InstructScore()
97
  batch_outputs, scores_ls = scorer.score(refs, outs)
98
  print(batch_outputs)
99
  print(scores_ls)
100
 
101
+
102
  if __name__ == "__main__":
103
  main()
InstructScore_English/added_tokens.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "[PAD]": 32000
3
- }
 
 
 
 
InstructScore_English/config.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "_name_or_path": "decapoda-research/llama-7b-hf",
3
- "architectures": [
4
- "LlamaForCausalLM"
5
- ],
6
- "bos_token_id": 0,
7
- "eos_token_id": 1,
8
- "hidden_act": "silu",
9
- "hidden_size": 4096,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 11008,
12
- "max_position_embeddings": 2048,
13
- "max_sequence_length": 2048,
14
- "model_type": "llama",
15
- "num_attention_heads": 32,
16
- "num_hidden_layers": 32,
17
- "pad_token_id": -1,
18
- "rms_norm_eps": 1e-06,
19
- "tie_word_embeddings": false,
20
- "torch_dtype": "float16",
21
- "transformers_version": "4.28.0.dev0",
22
- "use_cache": true,
23
- "vocab_size": 32001
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
InstructScore_English/generation_config.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 0,
4
- "eos_token_id": 1,
5
- "pad_token_id": 0,
6
- "transformers_version": "4.28.0.dev0"
7
- }
 
 
 
 
 
 
 
 
InstructScore_English/latest DELETED
@@ -1 +0,0 @@
1
- global_step222
 
 
InstructScore_English/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:275ee8dcae406d0cf4af5a06830a049419b441f034e370831a6a9ef90da84625
3
- size 13476958625
 
 
 
 
InstructScore_English/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e3c5cb412e12159a59afe5657ce4b5e0a06e7fb420bedbb5228fe1245702762
3
- size 14583
 
 
 
 
InstructScore_English/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:741230672078323886b763e522c728741456a587860909fc529ce815a7aca5ec
3
- size 14583
 
 
 
 
InstructScore_English/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ea587886b41579993bb5d20c79047b968ae2d71d22ba4c739b07ce31d7486a6
3
- size 14583
 
 
 
 
InstructScore_English/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ab727740f74dd67e60283d27b4339609a1dda888b067cc06520e2f1d7dc17db
3
- size 14583
 
 
 
 
InstructScore_English/rng_state_4.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:85fbffd04f81c5419775f5b8507ac368aa0ff88b146755becf7b9cd26c139501
3
- size 14583
 
 
 
 
InstructScore_English/rng_state_5.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcfb5fd9b58d3febef1eee1e52cb6997af12bac5fccd4be5e31c38721f4c3410
3
- size 14583
 
 
 
 
InstructScore_English/rng_state_6.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f120fd79fe083c8bfcd736e4bd37c6fe37ba8dc492e550f309fb809413a218d
3
- size 14583
 
 
 
 
InstructScore_English/rng_state_7.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:aed17a3c5698176f919ced6408c261f11e642658a3bdc526ead16625e7bb4a6f
3
- size 14583
 
 
 
 
InstructScore_English/special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "bos_token": "</s>",
3
- "eos_token": "</s>",
4
- "pad_token": "[PAD]",
5
- "unk_token": "</s>"
6
- }
 
 
 
 
 
 
 
InstructScore_English/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
- size 499723
 
 
 
 
InstructScore_English/tokenizer_config.json DELETED
@@ -1,34 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
- "clean_up_tokenization_spaces": false,
13
- "eos_token": {
14
- "__type": "AddedToken",
15
- "content": "",
16
- "lstrip": false,
17
- "normalized": true,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "model_max_length": 702,
22
- "pad_token": null,
23
- "padding_side": "right",
24
- "sp_model_kwargs": {},
25
- "tokenizer_class": "LlamaTokenizer",
26
- "unk_token": {
27
- "__type": "AddedToken",
28
- "content": "",
29
- "lstrip": false,
30
- "normalized": true,
31
- "rstrip": false,
32
- "single_word": false
33
- }
34
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
InstructScore_English/trainer_state.json DELETED
@@ -1,1348 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 2.9873843566021865,
5
- "global_step": 222,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.01,
12
- "learning_rate": 0,
13
- "loss": 9.0082,
14
- "step": 1
15
- },
16
- {
17
- "epoch": 0.03,
18
- "learning_rate": 0,
19
- "loss": 8.6216,
20
- "step": 2
21
- },
22
- {
23
- "epoch": 0.04,
24
- "learning_rate": 0,
25
- "loss": 8.9495,
26
- "step": 3
27
- },
28
- {
29
- "epoch": 0.05,
30
- "learning_rate": 0,
31
- "loss": 9.3486,
32
- "step": 4
33
- },
34
- {
35
- "epoch": 0.07,
36
- "learning_rate": 0,
37
- "loss": 9.1724,
38
- "step": 5
39
- },
40
- {
41
- "epoch": 0.08,
42
- "learning_rate": 0,
43
- "loss": 8.6433,
44
- "step": 6
45
- },
46
- {
47
- "epoch": 0.09,
48
- "learning_rate": 0.0,
49
- "loss": 8.872,
50
- "step": 7
51
- },
52
- {
53
- "epoch": 0.11,
54
- "learning_rate": 0.0,
55
- "loss": 10.083,
56
- "step": 8
57
- },
58
- {
59
- "epoch": 0.12,
60
- "learning_rate": 0.0,
61
- "loss": 10.3374,
62
- "step": 9
63
- },
64
- {
65
- "epoch": 0.13,
66
- "learning_rate": 2e-05,
67
- "loss": 9.8785,
68
- "step": 10
69
- },
70
- {
71
- "epoch": 0.15,
72
- "learning_rate": 2e-05,
73
- "loss": 10.5295,
74
- "step": 11
75
- },
76
- {
77
- "epoch": 0.16,
78
- "learning_rate": 2e-05,
79
- "loss": 7.6001,
80
- "step": 12
81
- },
82
- {
83
- "epoch": 0.17,
84
- "learning_rate": 2e-05,
85
- "loss": 7.0393,
86
- "step": 13
87
- },
88
- {
89
- "epoch": 0.19,
90
- "learning_rate": 2e-05,
91
- "loss": 6.9884,
92
- "step": 14
93
- },
94
- {
95
- "epoch": 0.2,
96
- "learning_rate": 2e-05,
97
- "loss": 6.9941,
98
- "step": 15
99
- },
100
- {
101
- "epoch": 0.22,
102
- "learning_rate": 2e-05,
103
- "loss": 7.235,
104
- "step": 16
105
- },
106
- {
107
- "epoch": 0.23,
108
- "learning_rate": 2e-05,
109
- "loss": 6.7456,
110
- "step": 17
111
- },
112
- {
113
- "epoch": 0.24,
114
- "learning_rate": 2e-05,
115
- "loss": 6.7632,
116
- "step": 18
117
- },
118
- {
119
- "epoch": 0.26,
120
- "learning_rate": 2e-05,
121
- "loss": 6.3373,
122
- "step": 19
123
- },
124
- {
125
- "epoch": 0.27,
126
- "learning_rate": 2e-05,
127
- "loss": 6.5851,
128
- "step": 20
129
- },
130
- {
131
- "epoch": 0.28,
132
- "learning_rate": 2e-05,
133
- "loss": 6.8799,
134
- "step": 21
135
- },
136
- {
137
- "epoch": 0.3,
138
- "learning_rate": 2e-05,
139
- "loss": 6.0525,
140
- "step": 22
141
- },
142
- {
143
- "epoch": 0.31,
144
- "learning_rate": 2e-05,
145
- "loss": 6.178,
146
- "step": 23
147
- },
148
- {
149
- "epoch": 0.32,
150
- "learning_rate": 2e-05,
151
- "loss": 6.6378,
152
- "step": 24
153
- },
154
- {
155
- "epoch": 0.34,
156
- "learning_rate": 2e-05,
157
- "loss": 6.1607,
158
- "step": 25
159
- },
160
- {
161
- "epoch": 0.35,
162
- "learning_rate": 2e-05,
163
- "loss": 6.2084,
164
- "step": 26
165
- },
166
- {
167
- "epoch": 0.36,
168
- "learning_rate": 2e-05,
169
- "loss": 5.7216,
170
- "step": 27
171
- },
172
- {
173
- "epoch": 0.38,
174
- "learning_rate": 2e-05,
175
- "loss": 5.5294,
176
- "step": 28
177
- },
178
- {
179
- "epoch": 0.39,
180
- "learning_rate": 2e-05,
181
- "loss": 5.6123,
182
- "step": 29
183
- },
184
- {
185
- "epoch": 0.4,
186
- "learning_rate": 2e-05,
187
- "loss": 5.3022,
188
- "step": 30
189
- },
190
- {
191
- "epoch": 0.42,
192
- "learning_rate": 2e-05,
193
- "loss": 5.2346,
194
- "step": 31
195
- },
196
- {
197
- "epoch": 0.43,
198
- "learning_rate": 2e-05,
199
- "loss": 5.268,
200
- "step": 32
201
- },
202
- {
203
- "epoch": 0.44,
204
- "learning_rate": 2e-05,
205
- "loss": 4.8788,
206
- "step": 33
207
- },
208
- {
209
- "epoch": 0.46,
210
- "learning_rate": 2e-05,
211
- "loss": 4.8944,
212
- "step": 34
213
- },
214
- {
215
- "epoch": 0.47,
216
- "learning_rate": 2e-05,
217
- "loss": 4.487,
218
- "step": 35
219
- },
220
- {
221
- "epoch": 0.48,
222
- "learning_rate": 2e-05,
223
- "loss": 4.3159,
224
- "step": 36
225
- },
226
- {
227
- "epoch": 0.5,
228
- "learning_rate": 2e-05,
229
- "loss": 4.1472,
230
- "step": 37
231
- },
232
- {
233
- "epoch": 0.51,
234
- "learning_rate": 2e-05,
235
- "loss": 4.2212,
236
- "step": 38
237
- },
238
- {
239
- "epoch": 0.52,
240
- "learning_rate": 2e-05,
241
- "loss": 4.1312,
242
- "step": 39
243
- },
244
- {
245
- "epoch": 0.54,
246
- "learning_rate": 2e-05,
247
- "loss": 3.9116,
248
- "step": 40
249
- },
250
- {
251
- "epoch": 0.55,
252
- "learning_rate": 2e-05,
253
- "loss": 3.8727,
254
- "step": 41
255
- },
256
- {
257
- "epoch": 0.57,
258
- "learning_rate": 2e-05,
259
- "loss": 3.4962,
260
- "step": 42
261
- },
262
- {
263
- "epoch": 0.58,
264
- "learning_rate": 2e-05,
265
- "loss": 3.2736,
266
- "step": 43
267
- },
268
- {
269
- "epoch": 0.59,
270
- "learning_rate": 2e-05,
271
- "loss": 3.1902,
272
- "step": 44
273
- },
274
- {
275
- "epoch": 0.61,
276
- "learning_rate": 2e-05,
277
- "loss": 3.0781,
278
- "step": 45
279
- },
280
- {
281
- "epoch": 0.62,
282
- "learning_rate": 2e-05,
283
- "loss": 2.9503,
284
- "step": 46
285
- },
286
- {
287
- "epoch": 0.63,
288
- "learning_rate": 2e-05,
289
- "loss": 2.6742,
290
- "step": 47
291
- },
292
- {
293
- "epoch": 0.65,
294
- "learning_rate": 2e-05,
295
- "loss": 2.5908,
296
- "step": 48
297
- },
298
- {
299
- "epoch": 0.66,
300
- "learning_rate": 2e-05,
301
- "loss": 2.4428,
302
- "step": 49
303
- },
304
- {
305
- "epoch": 0.67,
306
- "learning_rate": 2e-05,
307
- "loss": 2.1305,
308
- "step": 50
309
- },
310
- {
311
- "epoch": 0.69,
312
- "learning_rate": 2e-05,
313
- "loss": 2.0908,
314
- "step": 51
315
- },
316
- {
317
- "epoch": 0.7,
318
- "learning_rate": 2e-05,
319
- "loss": 1.8319,
320
- "step": 52
321
- },
322
- {
323
- "epoch": 0.71,
324
- "learning_rate": 2e-05,
325
- "loss": 1.6784,
326
- "step": 53
327
- },
328
- {
329
- "epoch": 0.73,
330
- "learning_rate": 2e-05,
331
- "loss": 1.553,
332
- "step": 54
333
- },
334
- {
335
- "epoch": 0.74,
336
- "learning_rate": 2e-05,
337
- "loss": 1.3679,
338
- "step": 55
339
- },
340
- {
341
- "epoch": 0.75,
342
- "learning_rate": 2e-05,
343
- "loss": 1.2306,
344
- "step": 56
345
- },
346
- {
347
- "epoch": 0.77,
348
- "learning_rate": 2e-05,
349
- "loss": 1.0531,
350
- "step": 57
351
- },
352
- {
353
- "epoch": 0.78,
354
- "learning_rate": 2e-05,
355
- "loss": 0.9721,
356
- "step": 58
357
- },
358
- {
359
- "epoch": 0.79,
360
- "learning_rate": 2e-05,
361
- "loss": 0.8153,
362
- "step": 59
363
- },
364
- {
365
- "epoch": 0.81,
366
- "learning_rate": 2e-05,
367
- "loss": 0.6768,
368
- "step": 60
369
- },
370
- {
371
- "epoch": 0.82,
372
- "learning_rate": 2e-05,
373
- "loss": 0.5962,
374
- "step": 61
375
- },
376
- {
377
- "epoch": 0.83,
378
- "learning_rate": 2e-05,
379
- "loss": 0.5061,
380
- "step": 62
381
- },
382
- {
383
- "epoch": 0.85,
384
- "learning_rate": 2e-05,
385
- "loss": 0.4319,
386
- "step": 63
387
- },
388
- {
389
- "epoch": 0.86,
390
- "learning_rate": 2e-05,
391
- "loss": 0.372,
392
- "step": 64
393
- },
394
- {
395
- "epoch": 0.87,
396
- "learning_rate": 2e-05,
397
- "loss": 0.327,
398
- "step": 65
399
- },
400
- {
401
- "epoch": 0.89,
402
- "learning_rate": 2e-05,
403
- "loss": 0.2799,
404
- "step": 66
405
- },
406
- {
407
- "epoch": 0.9,
408
- "learning_rate": 2e-05,
409
- "loss": 0.2455,
410
- "step": 67
411
- },
412
- {
413
- "epoch": 0.92,
414
- "learning_rate": 2e-05,
415
- "loss": 0.2267,
416
- "step": 68
417
- },
418
- {
419
- "epoch": 0.93,
420
- "learning_rate": 2e-05,
421
- "loss": 0.2177,
422
- "step": 69
423
- },
424
- {
425
- "epoch": 0.94,
426
- "learning_rate": 2e-05,
427
- "loss": 0.2029,
428
- "step": 70
429
- },
430
- {
431
- "epoch": 0.96,
432
- "learning_rate": 2e-05,
433
- "loss": 0.1958,
434
- "step": 71
435
- },
436
- {
437
- "epoch": 0.97,
438
- "learning_rate": 2e-05,
439
- "loss": 0.1748,
440
- "step": 72
441
- },
442
- {
443
- "epoch": 0.98,
444
- "learning_rate": 2e-05,
445
- "loss": 0.1772,
446
- "step": 73
447
- },
448
- {
449
- "epoch": 1.0,
450
- "learning_rate": 2e-05,
451
- "loss": 0.1639,
452
- "step": 74
453
- },
454
- {
455
- "epoch": 1.01,
456
- "learning_rate": 2e-05,
457
- "loss": 0.1495,
458
- "step": 75
459
- },
460
- {
461
- "epoch": 1.02,
462
- "learning_rate": 2e-05,
463
- "loss": 0.1595,
464
- "step": 76
465
- },
466
- {
467
- "epoch": 1.04,
468
- "learning_rate": 2e-05,
469
- "loss": 0.141,
470
- "step": 77
471
- },
472
- {
473
- "epoch": 1.05,
474
- "learning_rate": 2e-05,
475
- "loss": 0.1411,
476
- "step": 78
477
- },
478
- {
479
- "epoch": 1.06,
480
- "learning_rate": 2e-05,
481
- "loss": 0.1456,
482
- "step": 79
483
- },
484
- {
485
- "epoch": 1.08,
486
- "learning_rate": 2e-05,
487
- "loss": 0.1396,
488
- "step": 80
489
- },
490
- {
491
- "epoch": 1.09,
492
- "learning_rate": 2e-05,
493
- "loss": 0.1276,
494
- "step": 81
495
- },
496
- {
497
- "epoch": 1.1,
498
- "learning_rate": 2e-05,
499
- "loss": 0.1285,
500
- "step": 82
501
- },
502
- {
503
- "epoch": 1.12,
504
- "learning_rate": 2e-05,
505
- "loss": 0.1274,
506
- "step": 83
507
- },
508
- {
509
- "epoch": 1.13,
510
- "learning_rate": 2e-05,
511
- "loss": 0.1315,
512
- "step": 84
513
- },
514
- {
515
- "epoch": 1.14,
516
- "learning_rate": 2e-05,
517
- "loss": 0.1283,
518
- "step": 85
519
- },
520
- {
521
- "epoch": 1.16,
522
- "learning_rate": 2e-05,
523
- "loss": 0.1055,
524
- "step": 86
525
- },
526
- {
527
- "epoch": 1.17,
528
- "learning_rate": 2e-05,
529
- "loss": 0.1164,
530
- "step": 87
531
- },
532
- {
533
- "epoch": 1.18,
534
- "learning_rate": 2e-05,
535
- "loss": 0.1306,
536
- "step": 88
537
- },
538
- {
539
- "epoch": 1.2,
540
- "learning_rate": 2e-05,
541
- "loss": 0.121,
542
- "step": 89
543
- },
544
- {
545
- "epoch": 1.21,
546
- "learning_rate": 2e-05,
547
- "loss": 0.1275,
548
- "step": 90
549
- },
550
- {
551
- "epoch": 1.22,
552
- "learning_rate": 2e-05,
553
- "loss": 0.1081,
554
- "step": 91
555
- },
556
- {
557
- "epoch": 1.24,
558
- "learning_rate": 2e-05,
559
- "loss": 0.1114,
560
- "step": 92
561
- },
562
- {
563
- "epoch": 1.25,
564
- "learning_rate": 2e-05,
565
- "loss": 0.1186,
566
- "step": 93
567
- },
568
- {
569
- "epoch": 1.26,
570
- "learning_rate": 2e-05,
571
- "loss": 0.1097,
572
- "step": 94
573
- },
574
- {
575
- "epoch": 1.28,
576
- "learning_rate": 2e-05,
577
- "loss": 0.1163,
578
- "step": 95
579
- },
580
- {
581
- "epoch": 1.29,
582
- "learning_rate": 2e-05,
583
- "loss": 0.113,
584
- "step": 96
585
- },
586
- {
587
- "epoch": 1.31,
588
- "learning_rate": 2e-05,
589
- "loss": 0.1217,
590
- "step": 97
591
- },
592
- {
593
- "epoch": 1.32,
594
- "learning_rate": 2e-05,
595
- "loss": 0.1132,
596
- "step": 98
597
- },
598
- {
599
- "epoch": 1.33,
600
- "learning_rate": 2e-05,
601
- "loss": 0.1203,
602
- "step": 99
603
- },
604
- {
605
- "epoch": 1.35,
606
- "learning_rate": 2e-05,
607
- "loss": 0.1094,
608
- "step": 100
609
- },
610
- {
611
- "epoch": 1.36,
612
- "learning_rate": 2e-05,
613
- "loss": 0.1279,
614
- "step": 101
615
- },
616
- {
617
- "epoch": 1.37,
618
- "learning_rate": 2e-05,
619
- "loss": 0.1043,
620
- "step": 102
621
- },
622
- {
623
- "epoch": 1.39,
624
- "learning_rate": 2e-05,
625
- "loss": 0.1097,
626
- "step": 103
627
- },
628
- {
629
- "epoch": 1.4,
630
- "learning_rate": 2e-05,
631
- "loss": 0.1125,
632
- "step": 104
633
- },
634
- {
635
- "epoch": 1.41,
636
- "learning_rate": 2e-05,
637
- "loss": 0.118,
638
- "step": 105
639
- },
640
- {
641
- "epoch": 1.43,
642
- "learning_rate": 2e-05,
643
- "loss": 0.0962,
644
- "step": 106
645
- },
646
- {
647
- "epoch": 1.44,
648
- "learning_rate": 2e-05,
649
- "loss": 0.1093,
650
- "step": 107
651
- },
652
- {
653
- "epoch": 1.45,
654
- "learning_rate": 2e-05,
655
- "loss": 0.1219,
656
- "step": 108
657
- },
658
- {
659
- "epoch": 1.47,
660
- "learning_rate": 2e-05,
661
- "loss": 0.1046,
662
- "step": 109
663
- },
664
- {
665
- "epoch": 1.48,
666
- "learning_rate": 2e-05,
667
- "loss": 0.1017,
668
- "step": 110
669
- },
670
- {
671
- "epoch": 1.49,
672
- "learning_rate": 2e-05,
673
- "loss": 0.1044,
674
- "step": 111
675
- },
676
- {
677
- "epoch": 1.51,
678
- "learning_rate": 2e-05,
679
- "loss": 0.1084,
680
- "step": 112
681
- },
682
- {
683
- "epoch": 1.52,
684
- "learning_rate": 2e-05,
685
- "loss": 0.1158,
686
- "step": 113
687
- },
688
- {
689
- "epoch": 1.53,
690
- "learning_rate": 2e-05,
691
- "loss": 0.1074,
692
- "step": 114
693
- },
694
- {
695
- "epoch": 1.55,
696
- "learning_rate": 2e-05,
697
- "loss": 0.1075,
698
- "step": 115
699
- },
700
- {
701
- "epoch": 1.56,
702
- "learning_rate": 2e-05,
703
- "loss": 0.1052,
704
- "step": 116
705
- },
706
- {
707
- "epoch": 1.57,
708
- "learning_rate": 2e-05,
709
- "loss": 0.1017,
710
- "step": 117
711
- },
712
- {
713
- "epoch": 1.59,
714
- "learning_rate": 2e-05,
715
- "loss": 0.0992,
716
- "step": 118
717
- },
718
- {
719
- "epoch": 1.6,
720
- "learning_rate": 2e-05,
721
- "loss": 0.1096,
722
- "step": 119
723
- },
724
- {
725
- "epoch": 1.61,
726
- "learning_rate": 2e-05,
727
- "loss": 0.1272,
728
- "step": 120
729
- },
730
- {
731
- "epoch": 1.63,
732
- "learning_rate": 2e-05,
733
- "loss": 0.1049,
734
- "step": 121
735
- },
736
- {
737
- "epoch": 1.64,
738
- "learning_rate": 2e-05,
739
- "loss": 0.0998,
740
- "step": 122
741
- },
742
- {
743
- "epoch": 1.66,
744
- "learning_rate": 2e-05,
745
- "loss": 0.1052,
746
- "step": 123
747
- },
748
- {
749
- "epoch": 1.67,
750
- "learning_rate": 2e-05,
751
- "loss": 0.1177,
752
- "step": 124
753
- },
754
- {
755
- "epoch": 1.68,
756
- "learning_rate": 2e-05,
757
- "loss": 0.1052,
758
- "step": 125
759
- },
760
- {
761
- "epoch": 1.7,
762
- "learning_rate": 2e-05,
763
- "loss": 0.0997,
764
- "step": 126
765
- },
766
- {
767
- "epoch": 1.71,
768
- "learning_rate": 2e-05,
769
- "loss": 0.1115,
770
- "step": 127
771
- },
772
- {
773
- "epoch": 1.72,
774
- "learning_rate": 2e-05,
775
- "loss": 0.109,
776
- "step": 128
777
- },
778
- {
779
- "epoch": 1.74,
780
- "learning_rate": 2e-05,
781
- "loss": 0.1094,
782
- "step": 129
783
- },
784
- {
785
- "epoch": 1.75,
786
- "learning_rate": 2e-05,
787
- "loss": 0.0988,
788
- "step": 130
789
- },
790
- {
791
- "epoch": 1.76,
792
- "learning_rate": 2e-05,
793
- "loss": 0.1088,
794
- "step": 131
795
- },
796
- {
797
- "epoch": 1.78,
798
- "learning_rate": 2e-05,
799
- "loss": 0.0988,
800
- "step": 132
801
- },
802
- {
803
- "epoch": 1.79,
804
- "learning_rate": 2e-05,
805
- "loss": 0.0931,
806
- "step": 133
807
- },
808
- {
809
- "epoch": 1.8,
810
- "learning_rate": 2e-05,
811
- "loss": 0.0989,
812
- "step": 134
813
- },
814
- {
815
- "epoch": 1.82,
816
- "learning_rate": 2e-05,
817
- "loss": 0.1099,
818
- "step": 135
819
- },
820
- {
821
- "epoch": 1.83,
822
- "learning_rate": 2e-05,
823
- "loss": 0.107,
824
- "step": 136
825
- },
826
- {
827
- "epoch": 1.84,
828
- "learning_rate": 2e-05,
829
- "loss": 0.0991,
830
- "step": 137
831
- },
832
- {
833
- "epoch": 1.86,
834
- "learning_rate": 2e-05,
835
- "loss": 0.1045,
836
- "step": 138
837
- },
838
- {
839
- "epoch": 1.87,
840
- "learning_rate": 2e-05,
841
- "loss": 0.1112,
842
- "step": 139
843
- },
844
- {
845
- "epoch": 1.88,
846
- "learning_rate": 2e-05,
847
- "loss": 0.1128,
848
- "step": 140
849
- },
850
- {
851
- "epoch": 1.9,
852
- "learning_rate": 2e-05,
853
- "loss": 0.1106,
854
- "step": 141
855
- },
856
- {
857
- "epoch": 1.91,
858
- "learning_rate": 2e-05,
859
- "loss": 0.0934,
860
- "step": 142
861
- },
862
- {
863
- "epoch": 1.92,
864
- "learning_rate": 2e-05,
865
- "loss": 0.1105,
866
- "step": 143
867
- },
868
- {
869
- "epoch": 1.94,
870
- "learning_rate": 2e-05,
871
- "loss": 0.1024,
872
- "step": 144
873
- },
874
- {
875
- "epoch": 1.95,
876
- "learning_rate": 2e-05,
877
- "loss": 0.0985,
878
- "step": 145
879
- },
880
- {
881
- "epoch": 1.96,
882
- "learning_rate": 2e-05,
883
- "loss": 0.1051,
884
- "step": 146
885
- },
886
- {
887
- "epoch": 1.98,
888
- "learning_rate": 2e-05,
889
- "loss": 0.0955,
890
- "step": 147
891
- },
892
- {
893
- "epoch": 1.99,
894
- "learning_rate": 2e-05,
895
- "loss": 0.0915,
896
- "step": 148
897
- },
898
- {
899
- "epoch": 2.01,
900
- "learning_rate": 2e-05,
901
- "loss": 0.0985,
902
- "step": 149
903
- },
904
- {
905
- "epoch": 2.02,
906
- "learning_rate": 2e-05,
907
- "loss": 0.0678,
908
- "step": 150
909
- },
910
- {
911
- "epoch": 2.03,
912
- "learning_rate": 2e-05,
913
- "loss": 0.0667,
914
- "step": 151
915
- },
916
- {
917
- "epoch": 2.05,
918
- "learning_rate": 2e-05,
919
- "loss": 0.0706,
920
- "step": 152
921
- },
922
- {
923
- "epoch": 2.06,
924
- "learning_rate": 2e-05,
925
- "loss": 0.0689,
926
- "step": 153
927
- },
928
- {
929
- "epoch": 2.07,
930
- "learning_rate": 2e-05,
931
- "loss": 0.0661,
932
- "step": 154
933
- },
934
- {
935
- "epoch": 2.09,
936
- "learning_rate": 2e-05,
937
- "loss": 0.0705,
938
- "step": 155
939
- },
940
- {
941
- "epoch": 2.1,
942
- "learning_rate": 2e-05,
943
- "loss": 0.077,
944
- "step": 156
945
- },
946
- {
947
- "epoch": 2.11,
948
- "learning_rate": 2e-05,
949
- "loss": 0.0605,
950
- "step": 157
951
- },
952
- {
953
- "epoch": 2.13,
954
- "learning_rate": 2e-05,
955
- "loss": 0.0653,
956
- "step": 158
957
- },
958
- {
959
- "epoch": 2.14,
960
- "learning_rate": 2e-05,
961
- "loss": 0.07,
962
- "step": 159
963
- },
964
- {
965
- "epoch": 2.15,
966
- "learning_rate": 2e-05,
967
- "loss": 0.0694,
968
- "step": 160
969
- },
970
- {
971
- "epoch": 2.17,
972
- "learning_rate": 2e-05,
973
- "loss": 0.0655,
974
- "step": 161
975
- },
976
- {
977
- "epoch": 2.18,
978
- "learning_rate": 2e-05,
979
- "loss": 0.061,
980
- "step": 162
981
- },
982
- {
983
- "epoch": 2.19,
984
- "learning_rate": 2e-05,
985
- "loss": 0.0799,
986
- "step": 163
987
- },
988
- {
989
- "epoch": 2.21,
990
- "learning_rate": 2e-05,
991
- "loss": 0.0637,
992
- "step": 164
993
- },
994
- {
995
- "epoch": 2.22,
996
- "learning_rate": 2e-05,
997
- "loss": 0.0711,
998
- "step": 165
999
- },
1000
- {
1001
- "epoch": 2.23,
1002
- "learning_rate": 2e-05,
1003
- "loss": 0.0668,
1004
- "step": 166
1005
- },
1006
- {
1007
- "epoch": 2.25,
1008
- "learning_rate": 2e-05,
1009
- "loss": 0.0699,
1010
- "step": 167
1011
- },
1012
- {
1013
- "epoch": 2.26,
1014
- "learning_rate": 2e-05,
1015
- "loss": 0.0748,
1016
- "step": 168
1017
- },
1018
- {
1019
- "epoch": 2.27,
1020
- "learning_rate": 2e-05,
1021
- "loss": 0.0614,
1022
- "step": 169
1023
- },
1024
- {
1025
- "epoch": 2.29,
1026
- "learning_rate": 2e-05,
1027
- "loss": 0.0676,
1028
- "step": 170
1029
- },
1030
- {
1031
- "epoch": 2.3,
1032
- "learning_rate": 2e-05,
1033
- "loss": 0.0628,
1034
- "step": 171
1035
- },
1036
- {
1037
- "epoch": 2.31,
1038
- "learning_rate": 2e-05,
1039
- "loss": 0.0642,
1040
- "step": 172
1041
- },
1042
- {
1043
- "epoch": 2.33,
1044
- "learning_rate": 2e-05,
1045
- "loss": 0.0678,
1046
- "step": 173
1047
- },
1048
- {
1049
- "epoch": 2.34,
1050
- "learning_rate": 2e-05,
1051
- "loss": 0.0629,
1052
- "step": 174
1053
- },
1054
- {
1055
- "epoch": 2.35,
1056
- "learning_rate": 2e-05,
1057
- "loss": 0.0645,
1058
- "step": 175
1059
- },
1060
- {
1061
- "epoch": 2.37,
1062
- "learning_rate": 2e-05,
1063
- "loss": 0.0612,
1064
- "step": 176
1065
- },
1066
- {
1067
- "epoch": 2.38,
1068
- "learning_rate": 2e-05,
1069
- "loss": 0.0575,
1070
- "step": 177
1071
- },
1072
- {
1073
- "epoch": 2.4,
1074
- "learning_rate": 2e-05,
1075
- "loss": 0.0651,
1076
- "step": 178
1077
- },
1078
- {
1079
- "epoch": 2.41,
1080
- "learning_rate": 2e-05,
1081
- "loss": 0.0679,
1082
- "step": 179
1083
- },
1084
- {
1085
- "epoch": 2.42,
1086
- "learning_rate": 2e-05,
1087
- "loss": 0.0648,
1088
- "step": 180
1089
- },
1090
- {
1091
- "epoch": 2.44,
1092
- "learning_rate": 2e-05,
1093
- "loss": 0.0685,
1094
- "step": 181
1095
- },
1096
- {
1097
- "epoch": 2.45,
1098
- "learning_rate": 2e-05,
1099
- "loss": 0.0646,
1100
- "step": 182
1101
- },
1102
- {
1103
- "epoch": 2.46,
1104
- "learning_rate": 2e-05,
1105
- "loss": 0.0688,
1106
- "step": 183
1107
- },
1108
- {
1109
- "epoch": 2.48,
1110
- "learning_rate": 2e-05,
1111
- "loss": 0.068,
1112
- "step": 184
1113
- },
1114
- {
1115
- "epoch": 2.49,
1116
- "learning_rate": 2e-05,
1117
- "loss": 0.067,
1118
- "step": 185
1119
- },
1120
- {
1121
- "epoch": 2.5,
1122
- "learning_rate": 2e-05,
1123
- "loss": 0.0683,
1124
- "step": 186
1125
- },
1126
- {
1127
- "epoch": 2.52,
1128
- "learning_rate": 2e-05,
1129
- "loss": 0.0668,
1130
- "step": 187
1131
- },
1132
- {
1133
- "epoch": 2.53,
1134
- "learning_rate": 2e-05,
1135
- "loss": 0.0663,
1136
- "step": 188
1137
- },
1138
- {
1139
- "epoch": 2.54,
1140
- "learning_rate": 2e-05,
1141
- "loss": 0.0636,
1142
- "step": 189
1143
- },
1144
- {
1145
- "epoch": 2.56,
1146
- "learning_rate": 2e-05,
1147
- "loss": 0.0667,
1148
- "step": 190
1149
- },
1150
- {
1151
- "epoch": 2.57,
1152
- "learning_rate": 2e-05,
1153
- "loss": 0.0688,
1154
- "step": 191
1155
- },
1156
- {
1157
- "epoch": 2.58,
1158
- "learning_rate": 2e-05,
1159
- "loss": 0.0708,
1160
- "step": 192
1161
- },
1162
- {
1163
- "epoch": 2.6,
1164
- "learning_rate": 2e-05,
1165
- "loss": 0.0666,
1166
- "step": 193
1167
- },
1168
- {
1169
- "epoch": 2.61,
1170
- "learning_rate": 2e-05,
1171
- "loss": 0.0679,
1172
- "step": 194
1173
- },
1174
- {
1175
- "epoch": 2.62,
1176
- "learning_rate": 2e-05,
1177
- "loss": 0.0675,
1178
- "step": 195
1179
- },
1180
- {
1181
- "epoch": 2.64,
1182
- "learning_rate": 2e-05,
1183
- "loss": 0.0688,
1184
- "step": 196
1185
- },
1186
- {
1187
- "epoch": 2.65,
1188
- "learning_rate": 2e-05,
1189
- "loss": 0.068,
1190
- "step": 197
1191
- },
1192
- {
1193
- "epoch": 2.66,
1194
- "learning_rate": 2e-05,
1195
- "loss": 0.0665,
1196
- "step": 198
1197
- },
1198
- {
1199
- "epoch": 2.68,
1200
- "learning_rate": 2e-05,
1201
- "loss": 0.0678,
1202
- "step": 199
1203
- },
1204
- {
1205
- "epoch": 2.69,
1206
- "learning_rate": 2e-05,
1207
- "loss": 0.0674,
1208
- "step": 200
1209
- },
1210
- {
1211
- "epoch": 2.7,
1212
- "learning_rate": 2e-05,
1213
- "loss": 0.0627,
1214
- "step": 201
1215
- },
1216
- {
1217
- "epoch": 2.72,
1218
- "learning_rate": 2e-05,
1219
- "loss": 0.0676,
1220
- "step": 202
1221
- },
1222
- {
1223
- "epoch": 2.73,
1224
- "learning_rate": 2e-05,
1225
- "loss": 0.0605,
1226
- "step": 203
1227
- },
1228
- {
1229
- "epoch": 2.75,
1230
- "learning_rate": 2e-05,
1231
- "loss": 0.0656,
1232
- "step": 204
1233
- },
1234
- {
1235
- "epoch": 2.76,
1236
- "learning_rate": 2e-05,
1237
- "loss": 0.072,
1238
- "step": 205
1239
- },
1240
- {
1241
- "epoch": 2.77,
1242
- "learning_rate": 2e-05,
1243
- "loss": 0.0635,
1244
- "step": 206
1245
- },
1246
- {
1247
- "epoch": 2.79,
1248
- "learning_rate": 2e-05,
1249
- "loss": 0.0624,
1250
- "step": 207
1251
- },
1252
- {
1253
- "epoch": 2.8,
1254
- "learning_rate": 2e-05,
1255
- "loss": 0.0628,
1256
- "step": 208
1257
- },
1258
- {
1259
- "epoch": 2.81,
1260
- "learning_rate": 2e-05,
1261
- "loss": 0.0704,
1262
- "step": 209
1263
- },
1264
- {
1265
- "epoch": 2.83,
1266
- "learning_rate": 2e-05,
1267
- "loss": 0.0647,
1268
- "step": 210
1269
- },
1270
- {
1271
- "epoch": 2.84,
1272
- "learning_rate": 2e-05,
1273
- "loss": 0.0611,
1274
- "step": 211
1275
- },
1276
- {
1277
- "epoch": 2.85,
1278
- "learning_rate": 2e-05,
1279
- "loss": 0.0686,
1280
- "step": 212
1281
- },
1282
- {
1283
- "epoch": 2.87,
1284
- "learning_rate": 2e-05,
1285
- "loss": 0.0686,
1286
- "step": 213
1287
- },
1288
- {
1289
- "epoch": 2.88,
1290
- "learning_rate": 2e-05,
1291
- "loss": 0.0723,
1292
- "step": 214
1293
- },
1294
- {
1295
- "epoch": 2.89,
1296
- "learning_rate": 2e-05,
1297
- "loss": 0.0644,
1298
- "step": 215
1299
- },
1300
- {
1301
- "epoch": 2.91,
1302
- "learning_rate": 2e-05,
1303
- "loss": 0.0702,
1304
- "step": 216
1305
- },
1306
- {
1307
- "epoch": 2.92,
1308
- "learning_rate": 2e-05,
1309
- "loss": 0.0624,
1310
- "step": 217
1311
- },
1312
- {
1313
- "epoch": 2.93,
1314
- "learning_rate": 2e-05,
1315
- "loss": 0.0664,
1316
- "step": 218
1317
- },
1318
- {
1319
- "epoch": 2.95,
1320
- "learning_rate": 2e-05,
1321
- "loss": 0.0592,
1322
- "step": 219
1323
- },
1324
- {
1325
- "epoch": 2.96,
1326
- "learning_rate": 2e-05,
1327
- "loss": 0.0565,
1328
- "step": 220
1329
- },
1330
- {
1331
- "epoch": 2.97,
1332
- "learning_rate": 2e-05,
1333
- "loss": 0.0679,
1334
- "step": 221
1335
- },
1336
- {
1337
- "epoch": 2.99,
1338
- "learning_rate": 2e-05,
1339
- "loss": 0.0617,
1340
- "step": 222
1341
- }
1342
- ],
1343
- "max_steps": 222,
1344
- "num_train_epochs": 3,
1345
- "total_flos": 31856813015040.0,
1346
- "trial_name": null,
1347
- "trial_params": null
1348
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
InstructScore_English/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e04a090494b1b9ef7c1035d6a1a84d50dd4eedd9b6ceab3e135a3f316ba05776
3
- size 4923
 
 
 
 
InstructScore_English/zero_to_fp32.py DELETED
@@ -1,483 +0,0 @@
1
- #!/usr/bin/env python
2
- '''Copyright The Microsoft DeepSpeed Team'''
3
-
4
- # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
5
- # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
6
- # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
7
- # application.
8
- #
9
- # example: python zero_to_fp32.py . pytorch_model.bin
10
-
11
- import argparse
12
- import torch
13
- import glob
14
- import math
15
- import os
16
- import re
17
- from collections import OrderedDict
18
-
19
- # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
20
- # DeepSpeed data structures it has to be available in the current python environment.
21
- from deepspeed.utils import logger
22
- from deepspeed.checkpoint.constants import (DS_VERSION,
23
- OPTIMIZER_STATE_DICT,
24
- SINGLE_PARTITION_OF_FP32_GROUPS,
25
- FP32_FLAT_GROUPS,
26
- ZERO_STAGE,
27
- PARTITION_COUNT,
28
- PARAM_SHAPES,
29
- BUFFER_NAMES)
30
-
31
- debug = 0
32
-
33
- # load to cpu
34
- device = torch.device('cpu')
35
-
36
-
37
- def atoi(text):
38
- return int(text) if text.isdigit() else text
39
-
40
-
41
- def natural_keys(text):
42
- '''
43
- alist.sort(key=natural_keys) sorts in human order
44
- http://nedbatchelder.com/blog/200712/human_sorting.html
45
- (See Toothy's implementation in the comments)
46
- '''
47
- return [atoi(c) for c in re.split(r'(\d+)', text)]
48
-
49
-
50
- def get_model_state_file(checkpoint_dir, zero_stage):
51
- if not os.path.isdir(checkpoint_dir):
52
- raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
53
-
54
- # there should be only one file
55
- if zero_stage == 2:
56
- file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
57
- elif zero_stage == 3:
58
- file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
59
-
60
- if not os.path.exists(file):
61
- raise FileNotFoundError(f"can't find model states file at '{file}'")
62
-
63
- return file
64
-
65
-
66
- def get_optim_files(checkpoint_dir):
67
- # XXX: need to test that this simple glob rule works for multi-node setup too
68
- optim_files = sorted(glob.glob(os.path.join(checkpoint_dir,
69
- "*_optim_states.pt")),
70
- key=natural_keys)
71
-
72
- if len(optim_files) == 0:
73
- raise FileNotFoundError(
74
- f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")
75
-
76
- return optim_files
77
-
78
-
79
- def parse_model_state(file):
80
- state_dict = torch.load(file, map_location=device)
81
-
82
- if BUFFER_NAMES not in state_dict:
83
- raise ValueError(f"{file} is not a model state checkpoint")
84
- buffer_names = state_dict[BUFFER_NAMES]
85
- if debug:
86
- print("Found buffers:", buffer_names)
87
-
88
- # recover just the buffers while restoring them to fp32 if they were saved in fp16
89
- buffers = {
90
- k: v.float()
91
- for k,
92
- v in state_dict["module"].items() if k in buffer_names
93
- }
94
- param_shapes = state_dict[PARAM_SHAPES]
95
-
96
- ds_version = state_dict.get(DS_VERSION, None)
97
-
98
- return buffers, param_shapes, ds_version
99
-
100
-
101
- def parse_optim_states(files, ds_checkpoint_dir):
102
-
103
- total_files = len(files)
104
- state_dicts = []
105
- for f in files:
106
- state_dicts.append(torch.load(f, map_location=device))
107
-
108
- if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
109
- raise ValueError(f"{files[0]} is not a zero checkpoint")
110
- zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
111
- world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
112
-
113
- # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
114
- # parameters can be different from data parallelism for non-expert parameters. So we can just
115
- # use the max of the partition_count to get the dp world_size.
116
-
117
- if type(world_size) is list:
118
- world_size = max(world_size)
119
-
120
- if world_size != total_files:
121
- raise ValueError(
122
- f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
123
- "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
124
- )
125
-
126
- # the groups are named differently in each stage
127
- if zero_stage == 2:
128
- fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
129
- elif zero_stage == 3:
130
- fp32_groups_key = FP32_FLAT_GROUPS
131
- else:
132
- raise ValueError(f"unknown zero stage {zero_stage}")
133
-
134
- if zero_stage == 2:
135
- fp32_flat_groups = [
136
- state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key]
137
- for i in range(len(state_dicts))
138
- ]
139
- elif zero_stage == 3:
140
- # if there is more than one param group, there will be multiple flattened tensors - one
141
- # flattened tensor per group - for simplicity merge them into a single tensor
142
- #
143
- # XXX: could make the script more memory efficient for when there are multiple groups - it
144
- # will require matching the sub-lists of param_shapes for each param group flattened tensor
145
-
146
- fp32_flat_groups = [
147
- torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key],
148
- 0) for i in range(len(state_dicts))
149
- ]
150
-
151
- return zero_stage, world_size, fp32_flat_groups
152
-
153
-
154
- def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
155
- """
156
- Returns fp32 state_dict reconstructed from ds checkpoint
157
-
158
- Args:
159
- - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
160
-
161
- """
162
- print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
163
-
164
- optim_files = get_optim_files(ds_checkpoint_dir)
165
- zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
166
- print(
167
- f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
168
-
169
- model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
170
- buffers, param_shapes, ds_version = parse_model_state(model_file)
171
- print(f'Parsing checkpoint created by deepspeed=={ds_version}')
172
-
173
- if zero_stage == 2:
174
- return _get_fp32_state_dict_from_zero2_checkpoint(world_size,
175
- param_shapes,
176
- fp32_flat_groups,
177
- buffers)
178
- elif zero_stage == 3:
179
- return _get_fp32_state_dict_from_zero3_checkpoint(world_size,
180
- param_shapes,
181
- fp32_flat_groups,
182
- buffers)
183
-
184
-
185
- def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
186
- param_shapes,
187
- fp32_flat_groups,
188
- buffers):
189
-
190
- # Reconstruction protocol:
191
- #
192
- # XXX: document this
193
-
194
- if debug:
195
- for i in range(world_size):
196
- for j in range(len(fp32_flat_groups[0])):
197
- print(
198
- f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
199
-
200
- # XXX: memory usage doubles here (zero2)
201
- num_param_groups = len(fp32_flat_groups[0])
202
- merged_single_partition_of_fp32_groups = []
203
- for i in range(num_param_groups):
204
- merged_partitions = [sd[i] for sd in fp32_flat_groups]
205
- full_single_fp32_vector = torch.cat(merged_partitions, 0)
206
- merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
207
- avail_numel = sum([
208
- full_single_fp32_vector.numel()
209
- for full_single_fp32_vector in merged_single_partition_of_fp32_groups
210
- ])
211
-
212
- if debug:
213
- wanted_params = sum([len(shapes) for shapes in param_shapes])
214
- wanted_numel = sum(
215
- [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
216
- # not asserting if there is a mismatch due to possible padding
217
- print(f"Have {avail_numel} numels to process.")
218
- print(f"Need {wanted_numel} numels in {wanted_params} params.")
219
-
220
- state_dict = OrderedDict()
221
-
222
- # buffers
223
- state_dict.update(buffers)
224
- if debug:
225
- print(f"added {len(buffers)} buffers")
226
-
227
- # params
228
- # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
229
- # out-of-core computing solution
230
- total_numel = 0
231
- total_params = 0
232
- for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
233
- offset = 0
234
- avail_numel = full_single_fp32_vector.numel()
235
- for name, shape in shapes.items():
236
-
237
- unpartitioned_numel = shape.numel()
238
- total_numel += unpartitioned_numel
239
- total_params += 1
240
-
241
- if debug:
242
- print(
243
- f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
244
- )
245
- state_dict[name] = full_single_fp32_vector.narrow(
246
- 0,
247
- offset,
248
- unpartitioned_numel).view(shape)
249
- offset += unpartitioned_numel
250
-
251
- # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
252
- # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
253
- # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
254
- # live optimizer object, so we are checking that the numbers are within the right range
255
- align_to = 2 * world_size
256
-
257
- def zero2_align(x):
258
- return align_to * math.ceil(x / align_to)
259
-
260
- if debug:
261
- print(f"original offset={offset}, avail_numel={avail_numel}")
262
-
263
- offset = zero2_align(offset)
264
- avail_numel = zero2_align(avail_numel)
265
-
266
- if debug:
267
- print(f"aligned offset={offset}, avail_numel={avail_numel}")
268
-
269
- # Sanity check
270
- if offset != avail_numel:
271
- raise ValueError(
272
- f"consumed {offset} numels out of {avail_numel} - something is wrong")
273
-
274
- print(
275
- f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
276
- )
277
-
278
- return state_dict
279
-
280
-
281
- def zero3_partitioned_param_info(unpartitioned_numel, world_size):
282
- remainder = unpartitioned_numel % world_size
283
- padding_numel = (world_size - remainder) if remainder else 0
284
- partitioned_numel = math.ceil(unpartitioned_numel / world_size)
285
- return partitioned_numel, padding_numel
286
-
287
-
288
- def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
289
- param_shapes,
290
- fp32_flat_groups,
291
- buffers):
292
-
293
- # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
294
- # param, re-consolidating each param, while dealing with padding if any
295
-
296
- avail_numel = fp32_flat_groups[0].numel() * world_size
297
- # merge list of dicts, preserving order
298
- param_shapes = {k: v for d in param_shapes for k, v in d.items()}
299
-
300
- if debug:
301
- for i in range(world_size):
302
- print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
303
-
304
- wanted_params = len(param_shapes)
305
- wanted_numel = sum(shape.numel() for shape in param_shapes.values())
306
- # not asserting if there is a mismatch due to possible padding
307
- print(f"Have {avail_numel} numels to process.")
308
- print(f"Need {wanted_numel} numels in {wanted_params} params.")
309
-
310
- state_dict = OrderedDict()
311
-
312
- # buffers
313
- state_dict.update(buffers)
314
- if debug:
315
- print(f"added {len(buffers)} buffers")
316
-
317
- # params
318
- # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
319
- # out-of-core computing solution
320
- offset = 0
321
- total_numel = 0
322
- total_params = 0
323
- for name, shape in param_shapes.items():
324
-
325
- unpartitioned_numel = shape.numel()
326
- total_numel += unpartitioned_numel
327
- total_params += 1
328
-
329
- partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
330
-
331
- if debug:
332
- print(
333
- f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
334
- )
335
-
336
- # XXX: memory usage doubles here
337
- state_dict[name] = torch.cat(
338
- tuple(fp32_flat_groups[i].narrow(0,
339
- offset,
340
- partitioned_numel)
341
- for i in range(world_size)),
342
- 0).narrow(0,
343
- 0,
344
- unpartitioned_numel).view(shape)
345
- offset += partitioned_numel
346
-
347
- offset *= world_size
348
-
349
- # Sanity check
350
- if offset != avail_numel:
351
- raise ValueError(
352
- f"consumed {offset} numels out of {avail_numel} - something is wrong")
353
-
354
- print(
355
- f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
356
- )
357
-
358
- return state_dict
359
-
360
-
361
- def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
362
- """
363
- Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
364
- ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
365
- via a model hub.
366
-
367
- Args:
368
- - ``checkpoint_dir``: path to the desired checkpoint folder
369
- - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
370
-
371
- Returns:
372
- - pytorch ``state_dict``
373
-
374
- Note: this approach may not work if your application doesn't have sufficient free CPU memory and
375
- you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
376
- the checkpoint.
377
-
378
- A typical usage might be ::
379
-
380
- from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
381
- # do the training and checkpoint saving
382
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
383
- model = model.cpu() # move to cpu
384
- model.load_state_dict(state_dict)
385
- # submit to model hub or save the model to share with others
386
-
387
- In this example the ``model`` will no longer be usable in the deepspeed context of the same
388
- application. i.e. you will need to re-initialize the deepspeed engine, since
389
- ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
390
-
391
- If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
392
-
393
- """
394
- if tag is None:
395
- latest_path = os.path.join(checkpoint_dir, 'latest')
396
- if os.path.isfile(latest_path):
397
- with open(latest_path, 'r') as fd:
398
- tag = fd.read().strip()
399
- else:
400
- raise ValueError(f"Unable to find 'latest' file at {latest_path}")
401
-
402
- ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
403
-
404
- if not os.path.isdir(ds_checkpoint_dir):
405
- raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
406
-
407
- return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
408
-
409
-
410
- def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
411
- """
412
- Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
413
- loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
414
-
415
- Args:
416
- - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
417
- - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
418
- - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
419
- """
420
-
421
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
422
- print(f"Saving fp32 state dict to {output_file}")
423
- torch.save(state_dict, output_file)
424
-
425
-
426
- def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
427
- """
428
- 1. Put the provided model to cpu
429
- 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
430
- 3. Load it into the provided model
431
-
432
- Args:
433
- - ``model``: the model object to update
434
- - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
435
- - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
436
-
437
- Returns:
438
- - ``model`: modified model
439
-
440
- Make sure you have plenty of CPU memory available before you call this function. If you don't
441
- have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
442
- conveniently placed for you in the checkpoint folder.
443
-
444
- A typical usage might be ::
445
-
446
- from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
447
- model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
448
- # submit to model hub or save the model to share with others
449
-
450
- Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
451
- of the same application. i.e. you will need to re-initialize the deepspeed engine, since
452
- ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
453
-
454
- """
455
- logger.info(f"Extracting fp32 weights")
456
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
457
-
458
- logger.info(f"Overwriting model with fp32 weights")
459
- model = model.cpu()
460
- model.load_state_dict(state_dict, strict=False)
461
-
462
- return model
463
-
464
-
465
- if __name__ == "__main__":
466
-
467
- parser = argparse.ArgumentParser()
468
- parser.add_argument(
469
- "checkpoint_dir",
470
- type=str,
471
- help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
472
- parser.add_argument(
473
- "output_file",
474
- type=str,
475
- help=
476
- "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"
477
- )
478
- parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
479
- args = parser.parse_args()
480
-
481
- debug = args.debug
482
-
483
- convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
InstructScore_Tok/special_tokens_map.json DELETED
@@ -1 +0,0 @@
1
- {}
 
 
InstructScore_Tok/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
- size 499723
 
 
 
 
InstructScore_Tok/tokenizer_config.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "bos_token": "",
3
- "clean_up_tokenization_spaces": false,
4
- "eos_token": "",
5
- "model_max_length": 1000000000000000019884624838656,
6
- "special_tokens_map_file": "/mnt/data3/wendaxu/.cache/huggingface/hub/models--decapoda-research--llama-7b-hf/snapshots/5f98eefcc80e437ef68d457ad7bf167c2c6a1348/special_tokens_map.json",
7
- "tokenizer_class": "LlamaTokenizer",
8
- "unk_token": ""
9
- }
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -8,11 +8,13 @@ An amazing explanation metric (diagnostic report) for text generation evaluation
8
 
9
  First step, you may download all required dependencies through: pip3 install -r requirements.txt
10
 
11
- # Metric Card for InstructScore
12
- ![alt text](https://huggingface.co/xu1998hz/InstructScore/blob/main/figs/InstructScore_teaser.jpg)
13
 
14
  To run our metric, you only need five lines
15
 
 
 
16
  ````
17
  ```
18
  from InstructScore import *
@@ -23,6 +25,6 @@ batch_outputs, scores_ls = scorer.score(refs, outs)
23
  ```
24
  ````
25
 
26
- ![Overview](https://huggingface.co/xu1998hz/InstructScore/blob/main/figs/InstructScore.jpg)
27
-
28
 
 
 
8
 
9
  First step, you may download all required dependencies through: pip3 install -r requirements.txt
10
 
11
+ <!-- # Metric Card for InstructScore
12
+ ![alt text](https://huggingface.co/xu1998hz/InstructScore/blob/main/figs/InstructScore_teaser.jpg) -->
13
 
14
  To run our metric, you only need five lines
15
 
16
+ Please visit our github: https://github.com/xu1998hz/SEScore3/
17
+
18
  ````
19
  ```
20
  from InstructScore import *
 
25
  ```
26
  ````
27
 
28
+ <!-- ![Overview](https://huggingface.co/xu1998hz/InstructScore/blob/main/figs/InstructScore.jpg)
 
29
 
30
+ -->