Trace2333 committed on
Commit
c700ce7
0 Parent(s):

initial commit

Files changed (9)
  1. build_openprompt.py +46 -0
  2. data/1k.csv +0 -0
  3. gpt2_generation.py +453 -0
  4. rouge/README.md +161 -0
  5. rouge/app.py +6 -0
  6. rouge/requirements.txt +4 -0
  7. rouge/rouge.py +158 -0
  8. sft.py +92 -0
  9. utils.py +59 -0
build_openprompt.py ADDED
@@ -0,0 +1,46 @@
+ import csv
+ import random
+ import json
+
+ # Build (x, y) pairs for prompt expansion: x is a short prefix of the modifier
+ # list, y is the full comma-joined modifier list.
+ samples = {
+     "x": [],
+     "y": [],
+ }
+ little = False  # set True to build a tiny debugging subset
+ all_loaded_sample = 500000  # cap on rows loaded from prompts.csv
+ with open("./data/prompts.csv") as f:
+     csv_reader = csv.DictReader(f)
+     for row_number, row in enumerate(csv_reader):
+         # if row_number == random.randint(0, 1000):
+         #     break
+         if little:
+             if row_number > 100:
+                 break
+         if row_number > all_loaded_sample:
+             break
+
+         datum = row
+         modifiers = json.loads(datum['raw_data'])['modifiers']
+         n = random.randint(1, 10)
+         if len(modifiers) < 3:
+             continue
+         label = ",".join(modifiers)
+         if 0 < n <= 6:
+             x = modifiers[0]
+         elif 6 < n <= 9:
+             x = ",".join(modifiers[:2])
+         else:
+             x = ",".join(modifiers[:3])
+         # Small text to large text, so x is the shorter side; x keeps the first
+         # 1, 2 or 3 modifiers at a 6:3:1 ratio (hence n drawn from 1..10).
+
+         samples["x"].append(x)
+         samples["y"].append(label)
+
+
+ with open("./data/dataset_openprompt.json", "w") as f:
+     json.dump(samples, f, indent=4, ensure_ascii=False)
+ print("*" * 40, "save train done.", "with little" if little else "", "*" * 40)
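For a quick sanity check of the output, a minimal sketch (assuming the script above has already been run and `./data/dataset_openprompt.json` exists):

```python
import json

# Load the file written by build_openprompt.py and inspect one (x, y) pair.
with open("./data/dataset_openprompt.json") as f:
    samples = json.load(f)

print(len(samples["x"]), "pairs")
print("x:", samples["x"][0])  # truncated modifier prefix
print("y:", samples["y"][0])  # full comma-joined modifier list
```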
data/1k.csv ADDED
The diff for this file is too large to render. See raw diff
 
gpt2_generation.py ADDED
@@ -0,0 +1,453 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
+ """
+
+
+ import argparse
+ import inspect
+ import time
+ import logging
+ from typing import Tuple
+
+ import torch
+ from accelerate import PartialState
+ from accelerate.utils import set_seed
+
+ from transformers import (
+     AutoTokenizer,
+     BloomForCausalLM,
+     BloomTokenizerFast,
+     CTRLLMHeadModel,
+     CTRLTokenizer,
+     GenerationMixin,
+     GPT2LMHeadModel,
+     GPT2Tokenizer,
+     GPTJForCausalLM,
+     LlamaForCausalLM,
+     LlamaTokenizer,
+     OpenAIGPTLMHeadModel,
+     OpenAIGPTTokenizer,
+     OPTForCausalLM,
+     TransfoXLLMHeadModel,
+     TransfoXLTokenizer,
+     XLMTokenizer,
+     XLMWithLMHeadModel,
+     XLNetLMHeadModel,
+     XLNetTokenizer,
+ )
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+
+ logging.basicConfig(
+     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+     datefmt="%m/%d/%Y %H:%M:%S",
+     level=logging.INFO,
+ )
+ logger = logging.getLogger(__name__)
+
+ MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop
+
+ MODEL_CLASSES = {
+     "gpt2": (GPT2LMHeadModel, GPT2Tokenizer),
+     "ctrl": (CTRLLMHeadModel, CTRLTokenizer),
+     "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
+     "xlnet": (XLNetLMHeadModel, XLNetTokenizer),
+     "transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer),
+     "xlm": (XLMWithLMHeadModel, XLMTokenizer),
+     "gptj": (GPTJForCausalLM, AutoTokenizer),
+     "bloom": (BloomForCausalLM, BloomTokenizerFast),
+     "llama": (LlamaForCausalLM, LlamaTokenizer),
+     "opt": (OPTForCausalLM, GPT2Tokenizer),
+ }
+
+ # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
+ # in https://github.com/rusiaaman/XLNet-gen#methodology
+ # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
+ PREFIX = """In 1991, the remains of Russian Tsar Nicholas II and his family
+ (except for Alexei and Maria) are discovered.
+ The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+ remainder of the story. 1883 Western Siberia,
+ a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+ Rasputin has a vision and denounces one of the men as a horse thief. Although his
+ father initially slaps him for making such an accusation, Rasputin watches as the
+ man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+ the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+ with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
+
+
+ #
+ # Functions to prepare models' input
+ #
+
+
+ def prepare_ctrl_input(args, _, tokenizer, prompt_text):
+     if args.temperature > 0.7:
+         logger.info("CTRL typically works better with lower temperatures (and lower top_k).")
+
+     encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False)
+     if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()):
+         logger.info("WARNING! You are not starting your generation from a control code so you won't get good results")
+     return prompt_text
+
+
+ def prepare_xlm_input(args, model, tokenizer, prompt_text):
+     # kwargs = {"language": None, "mask_token_id": None}
+
+     # Set the language
+     use_lang_emb = hasattr(model.config, "use_lang_emb") and model.config.use_lang_emb
+     if hasattr(model.config, "lang2id") and use_lang_emb:
+         available_languages = model.config.lang2id.keys()
+         if args.xlm_language in available_languages:
+             language = args.xlm_language
+         else:
+             language = None
+             while language not in available_languages:
+                 language = input("Using XLM. Select language in " + str(list(available_languages)) + " >>> ")
+
+         model.config.lang_id = model.config.lang2id[language]
+         # kwargs["language"] = tokenizer.lang2id[language]
+
+     # TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers
+     # XLM masked-language modeling (MLM) models need masked token
+     # is_xlm_mlm = "mlm" in args.model_name_or_path
+     # if is_xlm_mlm:
+     #     kwargs["mask_token_id"] = tokenizer.mask_token_id
+
+     return prompt_text
+
+
+ def prepare_xlnet_input(args, _, tokenizer, prompt_text):
+     prefix = args.prefix if args.prefix else args.padding_text if args.padding_text else PREFIX
+     prompt_text = prefix + prompt_text
+     return prompt_text
+
+
+ def prepare_transfoxl_input(args, _, tokenizer, prompt_text):
+     prefix = args.prefix if args.prefix else args.padding_text if args.padding_text else PREFIX
+     prompt_text = prefix + prompt_text
+     return prompt_text
+
+
+ PREPROCESSING_FUNCTIONS = {
+     "ctrl": prepare_ctrl_input,
+     "xlm": prepare_xlm_input,
+     "xlnet": prepare_xlnet_input,
+     "transfo-xl": prepare_transfoxl_input,
+ }
+
+
+ def adjust_length_to_model(length, max_sequence_length):
+     if length < 0 and max_sequence_length > 0:
+         length = max_sequence_length
+     elif 0 < max_sequence_length < length:
+         length = max_sequence_length  # No generation bigger than model size
+     elif length < 0:
+         length = MAX_LENGTH  # avoid infinite loop
+     return length
+
+
+ def sparse_model_config(model_config):
+     embedding_size = None
+     if hasattr(model_config, "hidden_size"):
+         embedding_size = model_config.hidden_size
+     elif hasattr(model_config, "n_embed"):
+         embedding_size = model_config.n_embed
+     elif hasattr(model_config, "n_embd"):
+         embedding_size = model_config.n_embd
+
+     num_head = None
+     if hasattr(model_config, "num_attention_heads"):
+         num_head = model_config.num_attention_heads
+     elif hasattr(model_config, "n_head"):
+         num_head = model_config.n_head
+
+     if embedding_size is None or num_head is None or num_head == 0:
+         raise ValueError("Check the model config")
+
+     num_embedding_size_per_head = int(embedding_size / num_head)
+     if hasattr(model_config, "n_layer"):
+         num_layer = model_config.n_layer
+     elif hasattr(model_config, "num_hidden_layers"):
+         num_layer = model_config.num_hidden_layers
+     else:
+         raise ValueError("Number of hidden layers couldn't be determined from the model config")
+
+     return num_layer, num_head, num_embedding_size_per_head
+
+
+ def generate_past_key_values(model, batch_size, seq_len):
+     num_block_layers, num_attention_heads, num_embedding_size_per_head = sparse_model_config(model.config)
+     if model.config.model_type == "bloom":
+         past_key_values = tuple(
+             (
+                 torch.empty(int(num_attention_heads * batch_size), num_embedding_size_per_head, seq_len)
+                 .to(model.dtype)
+                 .to(model.device),
+                 torch.empty(int(num_attention_heads * batch_size), seq_len, num_embedding_size_per_head)
+                 .to(model.dtype)
+                 .to(model.device),
+             )
+             for _ in range(num_block_layers)
+         )
+     else:
+         past_key_values = tuple(
+             (
+                 torch.empty(batch_size, num_attention_heads, seq_len, num_embedding_size_per_head)
+                 .to(model.dtype)
+                 .to(model.device),
+                 torch.empty(batch_size, num_attention_heads, seq_len, num_embedding_size_per_head)
+                 .to(model.dtype)
+                 .to(model.device),
+             )
+             for _ in range(num_block_layers)
+         )
+     return past_key_values
+
+
+ def prepare_jit_inputs(inputs, model, tokenizer):
+     batch_size = len(inputs)
+     dummy_input = tokenizer.batch_encode_plus(inputs, return_tensors="pt")
+     dummy_input = dummy_input.to(model.device)
+     if model.config.use_cache:
+         dummy_input["past_key_values"] = generate_past_key_values(model, batch_size, 1)
+     dummy_input["attention_mask"] = torch.cat(
+         [
+             torch.zeros(dummy_input["attention_mask"].shape[0], 1)
+             .to(dummy_input["attention_mask"].dtype)
+             .to(model.device),
+             dummy_input["attention_mask"],
+         ],
+         -1,
+     )
+     return dummy_input
+
+
+ class _ModelFallbackWrapper(GenerationMixin):
+     __slots__ = ("_optimized", "_default")
+
+     def __init__(self, optimized, default):
+         self._optimized = optimized
+         self._default = default
+
+     def __call__(self, *args, **kwargs):
+         if kwargs["past_key_values"] is None and self._default.config.use_cache:
+             kwargs["past_key_values"] = generate_past_key_values(self._default, kwargs["input_ids"].shape[0], 0)
+         kwargs.pop("position_ids", None)
+         for k in list(kwargs.keys()):
+             if kwargs[k] is None or isinstance(kwargs[k], bool):
+                 kwargs.pop(k)
+         outputs = self._optimized(**kwargs)
+         lm_logits = outputs[0]
+         past_key_values = outputs[1]
+         fixed_output = CausalLMOutputWithPast(
+             loss=None,
+             logits=lm_logits,
+             past_key_values=past_key_values,
+             hidden_states=None,
+             attentions=None,
+         )
+         return fixed_output
+
+     def __getattr__(self, item):
+         return getattr(self._default, item)
+
+     def prepare_inputs_for_generation(
+         self, input_ids, past_key_values=None, inputs_embeds=None, use_cache=None, **kwargs
+     ):
+         return self._default.prepare_inputs_for_generation(
+             input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, **kwargs
+         )
+
+     def _reorder_cache(
+         self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
+     ) -> Tuple[Tuple[torch.Tensor]]:
+         """
+         This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
+         [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
+         beam_idx at every generation step.
+         """
+         return self._default._reorder_cache(past_key_values, beam_idx)
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--model_type",
+         default="gpt2",
+         type=str,
+         help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+     )
+     parser.add_argument(
+         "--model_name_or_path",
+         default="./output/gpt2_openprpmpt/checkpoint-218500",
+         type=str,
+         help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+     )
+
+     parser.add_argument("--prompt", type=str, default="")
+     parser.add_argument("--length", type=int, default=60)
+     parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")
+
+     parser.add_argument(
+         "--temperature",
+         type=float,
+         default=1.0,
+         help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
+     )
+     parser.add_argument(
+         "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2"
+     )
+     parser.add_argument("--k", type=int, default=3)
+     parser.add_argument("--p", type=float, default=0.9)
+
+     parser.add_argument("--prefix", type=str, default="", help="Text added prior to input.")
+     parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.")
+     parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.")
+
+     parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+     parser.add_argument(
+         "--use_cpu",
+         action="store_true",
+         help="Whether or not to use cpu. If set to False, we will use gpu/npu or mps device if available",
+     )
+     parser.add_argument("--num_return_sequences", type=int, default=4, help="The number of samples to generate.")
+     parser.add_argument(
+         "--fp16",
+         action="store_true",
+         help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+     )
+     parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference")
+     args = parser.parse_args()
+
+     # Initialize the distributed state.
+     distributed_state = PartialState(cpu=args.use_cpu)
+
+     logger.warning(f"device: {distributed_state.device}, 16-bits inference: {args.fp16}")
+
+     if args.seed is not None:
+         set_seed(args.seed)
+
+     # Initialize the model and tokenizer
+     try:
+         args.model_type = args.model_type.lower()
+         model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+     except KeyError:
+         raise KeyError(f"the model {args.model_type} you specified is not supported. You are welcome to add it and open a PR :)")
+
+     tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, padding_side='left')
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+         tokenizer.mask_token = tokenizer.eos_token
+     model = model_class.from_pretrained(args.model_name_or_path)
+
+     # Set the model to the right device
+     model.to(distributed_state.device)
+
+     if args.fp16:
+         model.half()
+     max_seq_length = getattr(model.config, "max_position_embeddings", 0)
+     args.length = adjust_length_to_model(args.length, max_sequence_length=max_seq_length)
+     logger.info(args)
+
+     prompt_text = args.prompt if args.prompt else input("Model prompt >>> ")
+
+     # Different models need different input formatting and/or extra arguments
+     requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys()
+     if requires_preprocessing:
+         prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
+         preprocessed_prompt_text = prepare_input(args, model, tokenizer, prompt_text)
+
+         if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
+             tokenizer_kwargs = {"add_space_before_punct_symbol": True}
+         else:
+             tokenizer_kwargs = {}
+
+         encoded_prompt = tokenizer.encode(
+             preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs
+         )
+     else:
+         prefix = args.prefix if args.prefix else args.padding_text
+         encoded_prompt = tokenizer.encode(prefix + prompt_text, add_special_tokens=False, return_tensors="pt")
+     encoded_prompt = encoded_prompt.to(distributed_state.device)
+
+     if encoded_prompt.size()[-1] == 0:
+         input_ids = None
+     else:
+         input_ids = encoded_prompt
+
+     if args.jit:
+         jit_input_texts = ["enable jit"]
+         jit_inputs = prepare_jit_inputs(jit_input_texts, model, tokenizer)
+         torch._C._jit_set_texpr_fuser_enabled(False)
+         model.config.return_dict = False
+         if hasattr(model, "forward"):
+             sig = inspect.signature(model.forward)
+         else:
+             sig = inspect.signature(model.__call__)
+         jit_inputs = tuple(jit_inputs[key] for key in sig.parameters if jit_inputs.get(key, None) is not None)
+         traced_model = torch.jit.trace(model, jit_inputs, strict=False)
+         traced_model = torch.jit.freeze(traced_model.eval())
+         traced_model(*jit_inputs)
+         traced_model(*jit_inputs)
+
+         model = _ModelFallbackWrapper(traced_model, model)
+     t1 = time.time()
+     output_sequences = model.generate(
+         input_ids=input_ids,
+         max_length=args.length + len(encoded_prompt[0]),
+         temperature=args.temperature,
+         top_k=args.k,
+         top_p=args.p,
+         repetition_penalty=args.repetition_penalty,
+         do_sample=True,
+         num_return_sequences=args.num_return_sequences,
+     )
+
+     # Remove the batch dimension when returning multiple sequences
+     if len(output_sequences.shape) > 2:
+         output_sequences.squeeze_()
+
+     generated_sequences = []
+
+     for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
+         print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===")
+         generated_sequence = generated_sequence.tolist()
+
+         # Decode text
+         text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
+
+         # Remove all text after the stop token
+         text = text[: text.find(args.stop_token) if args.stop_token else None]
+
+         # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
+         total_sequence = (
+             prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
+         )
+
+         generated_sequences.append(total_sequence)
+         print(total_sequence)
+
+     t2 = time.time()
+     print("*" * 60)
+     print(f"Time cost: {t2 - t1}")
+
+     return generated_sequences
+
+
+ if __name__ == "__main__":
+     main()
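The script ultimately wires the sampling flags into `model.generate`. A minimal stand-alone sketch of the same generation call, using the script's default checkpoint path and flag defaults (the checkpoint is assumed to exist; the prompt string is illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "./output/gpt2_openprpmpt/checkpoint-218500"  # default --model_name_or_path above
tokenizer = AutoTokenizer.from_pretrained(ckpt, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(ckpt)

input_ids = tokenizer.encode("portrait of a girl", return_tensors="pt")  # hypothetical prompt
output = model.generate(
    input_ids=input_ids,
    max_length=60 + input_ids.shape[-1],  # --length default plus prompt length
    do_sample=True,
    top_k=3,                              # --k default
    top_p=0.9,                            # --p default
    temperature=1.0,                      # --temperature default
    num_return_sequences=4,               # --num_return_sequences default
)
for seq in output:
    print(tokenizer.decode(seq, skip_special_tokens=True))
```

Running `python gpt2_generation.py --prompt "portrait of a girl"` with the defaults above is equivalent.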
rouge/README.md ADDED
@@ -0,0 +1,161 @@
+ ---
+ title: ROUGE
+ emoji: 🤗
+ colorFrom: blue
+ colorTo: red
+ sdk: gradio
+ sdk_version: 3.19.1
+ app_file: app.py
+ pinned: false
+ tags:
+ - evaluate
+ - metric
+ description: >-
+   ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for
+   evaluating automatic summarization and machine translation software in natural language processing.
+   The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation.
+
+   Note that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters.
+
+   This metric is a wrapper around the Google Research reimplementation of ROUGE:
+   https://github.com/google-research/google-research/tree/master/rouge
+ ---
+
+ # Metric Card for ROUGE
+
+ ## Metric Description
+ ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for evaluating automatic summarization and machine translation software in natural language processing. The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation.
+
+ Note that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters.
+
+ This metric is a wrapper around the [Google Research reimplementation of ROUGE](https://github.com/google-research/google-research/tree/master/rouge).
+
+ ## How to Use
+ At minimum, this metric takes as input a list of predictions and a list of references:
+ ```python
+ >>> rouge = evaluate.load('rouge')
+ >>> predictions = ["hello there", "general kenobi"]
+ >>> references = ["hello there", "general kenobi"]
+ >>> results = rouge.compute(predictions=predictions,
+ ...                         references=references)
+ >>> print(results)
+ {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
+ ```
+
+ One can also pass a custom tokenizer, which is especially useful for non-Latin languages.
+ ```python
+ >>> results = rouge.compute(predictions=predictions,
+ ...                         references=references,
+ ...                         tokenizer=lambda x: x.split())
+ >>> print(results)
+ {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
+ ```
+
+ It can also deal with lists of references for each prediction:
+ ```python
+ >>> rouge = evaluate.load('rouge')
+ >>> predictions = ["hello there", "general kenobi"]
+ >>> references = [["hello", "there"], ["general kenobi", "general yoda"]]
+ >>> results = rouge.compute(predictions=predictions,
+ ...                         references=references)
+ >>> print(results)
+ {'rouge1': 0.8333, 'rouge2': 0.5, 'rougeL': 0.8333, 'rougeLsum': 0.8333}
+ ```
+
+ ### Inputs
+ - **predictions** (`list`): list of predictions to score. Each prediction
+         should be a string with tokens separated by spaces.
+ - **references** (`list` or `list[list]`): list of references for each prediction or a list of several references per prediction. Each
+         reference should be a string with tokens separated by spaces.
+ - **rouge_types** (`list`): A list of rouge types to calculate. Defaults to `['rouge1', 'rouge2', 'rougeL', 'rougeLsum']`.
+     - Valid rouge types:
+         - `"rouge1"`: unigram (1-gram) based scoring
+         - `"rouge2"`: bigram (2-gram) based scoring
+         - `"rougeL"`: Longest common subsequence based scoring.
+         - `"rougeLsum"`: splits text using `"\n"`
+         - See [here](https://github.com/huggingface/datasets/issues/617) for more information
+ - **use_aggregator** (`boolean`): If True, returns aggregates. Defaults to `True`.
+ - **use_stemmer** (`boolean`): If `True`, uses Porter stemmer to strip word suffixes. Defaults to `False`.
+
+ ### Output Values
+ The output is a dictionary with one entry for each rouge type in the input list `rouge_types`. If `use_aggregator=False`, each dictionary entry is a list of scores, with one score for each sentence. E.g. if `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=False`, the output is:
+
+ ```python
+ {'rouge1': [0.6666666666666666, 1.0], 'rouge2': [0.0, 1.0]}
+ ```
+
+ If `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=True`, the output is of the following format:
+ ```python
+ {'rouge1': 1.0, 'rouge2': 1.0}
+ ```
+
+ The ROUGE values are in the range of 0 to 1.
+
+
+ #### Values from Popular Papers
+
+
+ ### Examples
+ An example without aggregation:
+ ```python
+ >>> rouge = evaluate.load('rouge')
+ >>> predictions = ["hello goodbye", "ankh morpork"]
+ >>> references = ["goodbye", "general kenobi"]
+ >>> results = rouge.compute(predictions=predictions,
+ ...                         references=references,
+ ...                         use_aggregator=False)
+ >>> print(list(results.keys()))
+ ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
+ >>> print(results["rouge1"])
+ [0.5, 0.0]
+ ```
+
+ The same example, but with aggregation:
+ ```python
+ >>> rouge = evaluate.load('rouge')
+ >>> predictions = ["hello goodbye", "ankh morpork"]
+ >>> references = ["goodbye", "general kenobi"]
+ >>> results = rouge.compute(predictions=predictions,
+ ...                         references=references,
+ ...                         use_aggregator=True)
+ >>> print(list(results.keys()))
+ ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
+ >>> print(results["rouge1"])
+ 0.25
+ ```
+
+ The same example, but only calculating `rouge1`:
+ ```python
+ >>> rouge = evaluate.load('rouge')
+ >>> predictions = ["hello goodbye", "ankh morpork"]
+ >>> references = ["goodbye", "general kenobi"]
+ >>> results = rouge.compute(predictions=predictions,
+ ...                         references=references,
+ ...                         rouge_types=['rouge1'],
+ ...                         use_aggregator=True)
+ >>> print(list(results.keys()))
+ ['rouge1']
+ >>> print(results["rouge1"])
+ 0.25
+ ```
+
+ ## Limitations and Bias
+ See [Schluter (2017)](https://aclanthology.org/E17-2007/) for an in-depth discussion of many of ROUGE's limits.
+
+ ## Citation
+ ```bibtex
+ @inproceedings{lin-2004-rouge,
+     title = "{ROUGE}: A Package for Automatic Evaluation of Summaries",
+     author = "Lin, Chin-Yew",
+     booktitle = "Text Summarization Branches Out",
+     month = jul,
+     year = "2004",
+     address = "Barcelona, Spain",
+     publisher = "Association for Computational Linguistics",
+     url = "https://www.aclweb.org/anthology/W04-1013",
+     pages = "74--81",
+ }
+ ```
+
+ ## Further References
+ - This metric is a wrapper around the [Google Research reimplementation of ROUGE](https://github.com/google-research/google-research/tree/master/rouge)
rouge/app.py ADDED
@@ -0,0 +1,6 @@
+ import evaluate
+ from evaluate.utils import launch_gradio_widget
+
+
+ module = evaluate.load("rouge")
+ launch_gradio_widget(module)
rouge/requirements.txt ADDED
@@ -0,0 +1,4 @@
+ git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+ absl-py
+ nltk
+ rouge_score>=0.1.2
rouge/rouge.py ADDED
@@ -0,0 +1,158 @@
+ # Copyright 2020 The HuggingFace Evaluate Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ ROUGE metric from Google Research github repo. """
+
+ # The dependencies in https://github.com/google-research/google-research/blob/master/rouge/requirements.txt
+ import absl  # Here to have a nice missing dependency error message early on
+ import datasets
+ import nltk  # Here to have a nice missing dependency error message early on
+ import numpy  # Here to have a nice missing dependency error message early on
+ import six  # Here to have a nice missing dependency error message early on
+ from rouge_score import rouge_scorer, scoring
+
+ import evaluate
+
+
+ _CITATION = """\
+ @inproceedings{lin-2004-rouge,
+     title = "{ROUGE}: A Package for Automatic Evaluation of Summaries",
+     author = "Lin, Chin-Yew",
+     booktitle = "Text Summarization Branches Out",
+     month = jul,
+     year = "2004",
+     address = "Barcelona, Spain",
+     publisher = "Association for Computational Linguistics",
+     url = "https://www.aclweb.org/anthology/W04-1013",
+     pages = "74--81",
+ }
+ """
+
+ _DESCRIPTION = """\
+ ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for
+ evaluating automatic summarization and machine translation software in natural language processing.
+ The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation.
+
+ Note that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters.
+
+ This metric is a wrapper around Google Research reimplementation of ROUGE:
+ https://github.com/google-research/google-research/tree/master/rouge
+ """
+
+ _KWARGS_DESCRIPTION = """
+ Calculates average rouge scores for a list of hypotheses and references
+ Args:
+     predictions: list of predictions to score. Each prediction
+         should be a string with tokens separated by spaces.
+     references: list of reference for each prediction. Each
+         reference should be a string with tokens separated by spaces.
+     rouge_types: A list of rouge types to calculate.
+         Valid names:
+         `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
+         `"rougeL"`: Longest common subsequence based scoring.
+         `"rougeLsum"`: rougeLsum splits text using `"\n"`.
+         See details in https://github.com/huggingface/datasets/issues/617
+     use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
+     use_aggregator: Return aggregates if this is set to True
+ Returns:
+     rouge1: rouge_1 (f1),
+     rouge2: rouge_2 (f1),
+     rougeL: rouge_l (f1),
+     rougeLsum: rouge_lsum (f1)
+ Examples:
+
+     >>> rouge = evaluate.load('rouge')
+     >>> predictions = ["hello there", "general kenobi"]
+     >>> references = ["hello there", "general kenobi"]
+     >>> results = rouge.compute(predictions=predictions, references=references)
+     >>> print(results)
+     {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
+ """
+
+
+ class Tokenizer:
+     """Helper class to wrap a callable into a class with a `tokenize` method as used by rouge-score."""
+
+     def __init__(self, tokenizer_func):
+         self.tokenizer_func = tokenizer_func
+
+     def tokenize(self, text):
+         return self.tokenizer_func(text)
+
+
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+ class Rouge(evaluate.Metric):
+     def _info(self):
+         return evaluate.MetricInfo(
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             inputs_description=_KWARGS_DESCRIPTION,
+             features=[
+                 datasets.Features(
+                     {
+                         "predictions": datasets.Value("string", id="sequence"),
+                         "references": datasets.Sequence(datasets.Value("string", id="sequence")),
+                     }
+                 ),
+                 datasets.Features(
+                     {
+                         "predictions": datasets.Value("string", id="sequence"),
+                         "references": datasets.Value("string", id="sequence"),
+                     }
+                 ),
+             ],
+             codebase_urls=["https://github.com/google-research/google-research/tree/master/rouge"],
+             reference_urls=[
+                 "https://en.wikipedia.org/wiki/ROUGE_(metric)",
+                 "https://github.com/google-research/google-research/tree/master/rouge",
+             ],
+         )
+
+     def _compute(
+         self, predictions, references, rouge_types=None, use_aggregator=True, use_stemmer=False, tokenizer=None
+     ):
+         if rouge_types is None:
+             rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
+
+         multi_ref = isinstance(references[0], list)
+
+         if tokenizer is not None:
+             tokenizer = Tokenizer(tokenizer)
+
+         scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer, tokenizer=tokenizer)
+         if use_aggregator:
+             aggregator = scoring.BootstrapAggregator()
+         else:
+             scores = []
+
+         for ref, pred in zip(references, predictions):
+             if multi_ref:
+                 score = scorer.score_multi(ref, pred)
+             else:
+                 score = scorer.score(ref, pred)
+             if use_aggregator:
+                 aggregator.add_scores(score)
+             else:
+                 scores.append(score)
+
+         if use_aggregator:
+             result = aggregator.aggregate()
+             for key in result:
+                 result[key] = result[key].mid.fmeasure
+
+         else:
+             result = {}
+             for key in scores[0]:
+                 result[key] = list(score[key].fmeasure for score in scores)
+
+         return result
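A quick smoke test of the bundled metric, as a hedged sketch: it assumes `evaluate`, `absl-py`, `nltk`, and `rouge_score` are installed, and that loading by local directory path resolves this `rouge/rouge.py` script (otherwise `evaluate.load("rouge")` fetches the Hub copy instead):

```python
import evaluate

# Load the metric from this directory rather than from the Hub (assumed to work for a local module path).
rouge = evaluate.load("./rouge")

predictions = ["hello there", "general kenobi"]
references = [["hello", "there"], ["general kenobi", "general yoda"]]  # multi-reference form

# A whitespace tokenizer, exercising the `tokenizer` argument handled in _compute above.
results = rouge.compute(predictions=predictions,
                        references=references,
                        tokenizer=lambda x: x.split())
print(results)
```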
sft.py ADDED
@@ -0,0 +1,92 @@
+ import time
+ import math
+ import evaluate
+ import numpy as np
+
+ from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
+ from transformers import TrainingArguments, Trainer
+
+ from utils import (
+     get_dataset,
+     get_tok_and_model,
+     get_open_prompt_data,
+     get_dict_dataset,
+     get_advance_dataset,
+ )
+
+ base_model = "distilgpt2"
+ tokenizer, model = get_tok_and_model(f"./models/{base_model}")
+ tokenizer.pad_token = tokenizer.eos_token
+ rouge = evaluate.load("rouge")
+
+ # train_data, test_data = get_open_prompt_data("./data")
+ # train_dataset, test_dataset = get_dataset(train_data, test_data)
+ dict_data = get_dict_dataset("./data")
+ dataset = get_advance_dataset(dict_data)
+ dataset = dataset.train_test_split(test_size=0.2)
+
+
+ def preprocess_function(examples):
+     x_inputs = [x for x in examples["x"]]
+     y_inputs = examples["y"]
+     model_inputs = tokenizer(x_inputs, max_length=128, truncation=True)
+
+     # The tokenized targets are computed but not used below: labels are set to
+     # the input ids, i.e. plain causal-LM next-token prediction on x.
+     labels = tokenizer(text_target=y_inputs, max_length=128, truncation=True)
+
+     model_inputs["labels"] = model_inputs["input_ids"]
+     return model_inputs
+
+
+ # ROUGE-based evaluation helper (defined for reference; not passed to the Trainer below).
+ def compute_metrics(eval_pred):
+     predictions, labels = eval_pred
+     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+     result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+
+     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
+     result["gen_len"] = np.mean(prediction_lens)
+
+     return {k: round(v, 4) for k, v in result.items()}
+
+
+ # data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+ data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
+
+
+ print("tokenize data...")
+ t1 = time.time()
+ tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["x", "y"])
+ t2 = time.time()
+ print(f"data tokenize done. process time : {t2 - t1}")
+
+
+ training_args = TrainingArguments(
+     output_dir=f"./output/{base_model}_openprpmpt",
+     evaluation_strategy="steps",
+     eval_steps=20000,
+     learning_rate=2e-5,
+     lr_scheduler_type="constant",
+     report_to="tensorboard",
+     per_device_train_batch_size=64,
+     per_device_eval_batch_size=32,
+     adam_beta1=0.9,
+     adam_beta2=0.98,
+     save_total_limit=1,
+     num_train_epochs=100,
+     fp16=True,
+     push_to_hub=False,
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset["train"],
+     eval_dataset=tokenized_dataset["test"],
+     tokenizer=tokenizer,
+     data_collator=data_collator,
+ )
+
+ trainer.train()
+
+ eval_results = trainer.evaluate()
+ print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
utils.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ import json
+
+ from torch.utils.data import Dataset
+ from datasets import Dataset as AdvancedDataset
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+
+ DEFAULT_TRAIN_DATA_NAME = "train_openprompt.json"
+ DEFAULT_TEST_DATA_NAME = "test_openprompt.json"
+ DEFAULT_DICT_DATA_NAME = "dataset_openprompt.json"
+
+
+ def get_open_prompt_data(path_for_data):
+     with open(os.path.join(path_for_data, DEFAULT_TRAIN_DATA_NAME)) as f:
+         train_data = json.load(f)
+
+     with open(os.path.join(path_for_data, DEFAULT_TEST_DATA_NAME)) as f:
+         test_data = json.load(f)
+
+     return train_data, test_data
+
+
+ def get_tok_and_model(path_for_model):
+     if not os.path.exists(path_for_model):
+         raise RuntimeError("no cached model.")
+     tok = AutoTokenizer.from_pretrained(path_for_model, padding_side='left')
+     tok.pad_token_id = 50256
+     # default for open-ended generation (50256 is the GPT-2 eos_token_id)
+     model = AutoModelForCausalLM.from_pretrained(path_for_model)
+     return tok, model
+
+
+ class OpenPromptDataset(Dataset):
+     def __init__(self, data) -> None:
+         super().__init__()
+         self.data = data
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, index):
+         return self.data[index]
+
+
+ def get_dataset(train_data, test_data):
+     train_dataset = OpenPromptDataset(train_data)
+     test_dataset = OpenPromptDataset(test_data)
+     return train_dataset, test_dataset
+
+
+ def get_dict_dataset(path_for_data):
+     with open(os.path.join(path_for_data, DEFAULT_DICT_DATA_NAME)) as f:
+         dict_data = json.load(f)
+     return dict_data
+
+
+ def get_advance_dataset(dict_data):
+     if not isinstance(dict_data, dict):
+         raise RuntimeError("dict_data is not a dict.")
+     dataset = AdvancedDataset.from_dict(dict_data)
+
+     return dataset
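For completeness, a small sketch of how these helpers fit together; it mirrors the calls in sft.py and assumes `./data/dataset_openprompt.json` has been generated and a model directory such as `./models/distilgpt2` is cached locally:

```python
from utils import get_dict_dataset, get_advance_dataset, get_tok_and_model

dict_data = get_dict_dataset("./data")        # {"x": [...], "y": [...]}
dataset = get_advance_dataset(dict_data)      # datasets.Dataset with "x" and "y" columns
splits = dataset.train_test_split(test_size=0.2)
print(splits)

# Load a locally cached tokenizer/model pair (raises RuntimeError if the path is missing).
tokenizer, model = get_tok_and_model("./models/distilgpt2")
```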