Spaces:
Configuration error
Configuration error
""" | |
batch_selfinstruct_generate.py

run:
python -m generate_instruction generate_instruction_following_data \
  --output_dir ./ \
  --num_instructions_to_generate 10 \
  --model_name="text-davinci-003"
""" | |
import time | |
import json | |
import os | |
import random | |
import re | |
import string | |
from functools import partial | |
from multiprocessing import Pool | |
import numpy as np | |
import tqdm | |
from rouge_score import rouge_scorer | |
import utils | |
import fire | |
def encode_prompt(prompt_instructions):
    """Encode multiple prompt instructions into a single string.

    The prompt starts with the contents of ``./prompt.txt`` (resolved against
    the current working directory) followed by a newline. Each task then adds
    a ``###`` separator and numbered ``Instruction``/``Input``/``Output``
    sections. The prompt ends with a final separator and an open
    ``N. Instruction:`` header, where ``N`` is one more than the number of
    tasks.

    Args:
        prompt_instructions: Sequence of dicts holding string values under
            the keys ``"instruction"``, ``"input"`` and ``"output"``. May be
            empty, in which case only the template and the open
            ``1. Instruction:`` header are emitted.

    Returns:
        The assembled prompt string.
    """
    # Read the template through a context manager so the file handle is
    # closed deterministically instead of being left to garbage collection.
    with open("./prompt.txt") as template_file:
        parts = [template_file.read() + "\n"]

    # ``number`` stays 0 for an empty task list, so the trailing header
    # becomes "1. Instruction:" instead of failing on an unbound loop variable.
    number = 0
    for number, task in enumerate(prompt_instructions, start=1):
        # Collapse whitespace runs and drop a trailing colon so every
        # instruction occupies a single, uniformly formatted line.
        instruction = re.sub(r"\s+", " ", task["instruction"]).strip().rstrip(":")
        input_text = task["input"]
        if input_text == "":
            input_text = "<noinput>"
        parts.append("###\n")
        parts.append(f"{number}. Instruction: {instruction}\n")
        parts.append(f"{number}. Input:\n{input_text}\n")
        parts.append(f"{number}. Output:\n{task['output']}\n")

    parts.append("###\n")
    parts.append(f"{number + 1}. Instruction:")
    return "".join(parts)
# Keywords marking instructions that are not suitable for language models.
_BLACKLISTED_WORDS = (
    "image",
    "images",
    "graph",
    "graphs",
    "picture",
    "pictures",
    "file",
    "files",
    "map",
    "maps",
    "draw",
    "plot",
    "go to",
    "video",
    "audio",
    "music",
    "flowchart",
    "diagram",
)

# A single whole-word, case-insensitive pattern covering every blacklisted
# keyword, compiled once instead of once per keyword per instruction.
_BLACKLIST_PATTERN = re.compile(
    r"\b(" + "|".join(re.escape(word) for word in _BLACKLISTED_WORDS) + r")\b",
    flags=re.IGNORECASE,
)


def post_process_gpt3_response(num_prompt_instructions, response):
    """Parse one completion into a list of filtered instruction dicts.

    The completion text is prefixed with the ``N. Instruction:`` header that
    ended the prompt, split on ``###`` and each section is split into its
    numbered Instruction/Input/Output parts. A section is discarded when it:

    * is the last one and decoding stopped because of the length limit;
    * does not contain exactly the Instruction, Input, Output labels in
      that order, carrying the expected number;
    * has an instruction of 3 words or fewer, or of more than 150 words;
    * mentions a blacklisted keyword as a whole word (case-insensitive);
    * starts with "Write a program", punctuation, or a non-ASCII character.

    Args:
        num_prompt_instructions: Number of examples that were in the prompt;
            generated sections are expected to be numbered from this value
            plus one.
        response: ``None``, or a mapping with the keys ``"text"`` and
            ``"finish_reason"``.

    Returns:
        A list of dicts with the keys ``"instruction"``, ``"input"`` and
        ``"output"``; empty when ``response`` is ``None``.
    """
    if response is None:
        return []

    first_number = num_prompt_instructions + 1
    completion = f"{first_number}. Instruction:" + response["text"]
    sections = re.split("###", completion)
    # If decoding stopped due to length, the last example is likely
    # truncated, so it is discarded.
    truncated = response["finish_reason"] == "length"
    last_position = len(sections) - 1

    instructions = []
    for position, section in enumerate(sections):
        if truncated and position == last_position:
            continue

        number = first_number + position
        # Raw f-string: "\." and "\s" must reach the regex engine verbatim
        # rather than being treated as (invalid) string escape sequences.
        fields = re.split(rf"{number}\.\s+(Instruction|Input|Output):", section)
        if len(fields) != 7:
            continue
        # Seven fields alone do not guarantee the label order; reject
        # sections whose labels would put text under the wrong key.
        if fields[1::2] != ["Instruction", "Input", "Output"]:
            continue

        instruction = fields[2].strip()
        input_text = fields[4].strip()
        if input_text.lower() == "<noinput>":
            input_text = ""
        output = fields[6].strip()

        # Filter out too short or too long instructions.
        word_count = len(instruction.split())
        if word_count <= 3 or word_count > 150:
            continue
        # Filter based on keywords that are not suitable for language models.
        if _BLACKLIST_PATTERN.search(instruction):
            continue
        # The model tends to add "write a program" to existing instructions,
        # which makes it unclear whether a program or the direct result is
        # expected. This is not a comprehensive filter for all programming
        # instructions.
        if instruction.startswith("Write a program"):
            continue
        # Filter those starting with punctuation.
        if instruction[0] in string.punctuation:
            continue
        # Filter those starting with a non-English character.
        if not instruction[0].isascii():
            continue

        instructions.append({"instruction": instruction, "input": input_text, "output": output})
    return instructions
def find_word_in_string(w, s):
    """Search ``s`` for ``w`` as a whole word, ignoring case.

    Args:
        w: The word or phrase to look for. It is matched literally: any
            regex metacharacters it contains are escaped.
        s: The string to search.

    Returns:
        The ``re.Match`` for the first occurrence of ``w`` delimited by word
        boundaries (group 1 holds the matched text), or ``None`` when there
        is no such occurrence.
    """
    # Escape the word so characters such as "." or "+" cannot change the
    # meaning of the pattern.
    pattern = re.compile(r"\b({0})\b".format(re.escape(w)), flags=re.IGNORECASE)
    return pattern.search(s)
def generate_instruction_following_data(
    output_dir="./",
    seed_tasks_path="./seed_tasks.jsonl",
    num_instructions_to_generate=100,
    model_name="text-davinci-003",
    num_prompt_instructions=3,
    request_batch_size=5,
    temperature=1.0,
    top_p=1.0,
    num_cpus=16,
):
    """Generate instruction data from seed tasks and save it as JSON.

    Prompts are built from randomly sampled seed tasks and sent in batches
    via ``utils.openai_completion``. Each completion is parsed with
    ``post_process_gpt3_response``. A parsed instruction is kept only if its
    highest ROUGE-L F-measure against every seed instruction and every
    previously kept instruction is at most 0.7. Kept entries are annotated
    with ``"most_similar_instructions"`` (the up-to-10 closest instructions
    and their scores) and ``"avg_similarity_score"``.

    After every request the accumulated data is written to
    ``<output_dir>/regen.json``. If that file already exists at start-up,
    its contents are loaded and generation continues from there.

    Args:
        output_dir: Directory for ``regen.json``; created if missing.
        seed_tasks_path: JSON-lines file of seed tasks. Each task needs an
            ``"instruction"`` and a non-empty ``"instances"`` list whose
            first item has ``"input"`` and ``"output"``.
        num_instructions_to_generate: Generation stops once at least this
            many machine-generated instructions have been collected.
        model_name: Model name forwarded to ``utils.openai_completion``.
        num_prompt_instructions: Number of seed tasks sampled per prompt.
        request_batch_size: Number of prompts per completion request.
        temperature: Sampling temperature forwarded to the decoding args.
        top_p: Nucleus-sampling value forwarded to the decoding args.
        num_cpus: Number of worker processes used for ROUGE scoring.
    """
    # Close the seed file deterministically rather than leaking the handle.
    with open(seed_tasks_path, "r") as seed_file:
        seed_tasks = [json.loads(line) for line in seed_file]
    seed_instruction_data = [
        {
            "instruction": task["instruction"],
            "input": task["instances"][0]["input"],
            "output": task["instances"][0]["output"],
        }
        for task in seed_tasks
    ]
    print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "regen.json")
    request_idx = 0

    # Load the LM-generated instructions from a previous run, if any.
    machine_instruction_data = []
    if os.path.exists(output_path):
        machine_instruction_data = utils.jload(output_path)
        print(f"Loaded {len(machine_instruction_data)} machine-generated instructions")

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

    # Now let's generate new instructions!
    progress_bar = tqdm.tqdm(total=num_instructions_to_generate)
    if machine_instruction_data:
        progress_bar.update(len(machine_instruction_data))

    # First tokenize all the seed instructions and generated machine
    # instructions, so each candidate is only compared against cached tokens.
    all_instructions = [entry["instruction"] for entry in seed_instruction_data] + [
        entry["instruction"] for entry in machine_instruction_data
    ]
    all_instruction_tokens = [scorer._tokenizer.tokenize(inst) for inst in all_instructions]

    # A single worker pool serves the whole run; starting and tearing down a
    # pool for every candidate instruction is pure overhead.
    with Pool(num_cpus) as pool:
        while len(machine_instruction_data) < num_instructions_to_generate:
            request_idx += 1

            # Only sampling from the seed tasks.
            batch_inputs = [
                encode_prompt(random.sample(seed_instruction_data, num_prompt_instructions))
                for _ in range(request_batch_size)
            ]
            decoding_args = utils.OpenAIDecodingArguments(
                temperature=temperature,
                n=1,
                max_tokens=3072,  # hard-code to maximize the length. the requests will be automatically adjusted
                top_p=top_p,
                stop=["\n20", "20."],  # the original list repeated "20."; one copy is equivalent
            )
            request_start = time.time()
            results = utils.openai_completion(
                prompts=batch_inputs,
                model_name=model_name,
                batch_size=request_batch_size,
                decoding_args=decoding_args,
                logit_bias={"50256": -100},  # prevent the <|endoftext|> token from being generated
            )
            request_duration = time.time() - request_start

            process_start = time.time()
            instruction_data = []
            for result in results:
                instruction_data += post_process_gpt3_response(num_prompt_instructions, result)

            total = len(instruction_data)
            keep = 0
            for entry in instruction_data:
                # Compute similarity with the pre-tokenized instructions.
                new_instruction_tokens = scorer._tokenizer.tokenize(entry["instruction"])
                rouge_scores = pool.map(
                    partial(rouge_scorer._score_lcs, new_instruction_tokens),
                    all_instruction_tokens,
                )
                rouge_scores = [score.fmeasure for score in rouge_scores]
                # Too similar to an instruction we already have: drop it.
                if max(rouge_scores) > 0.7:
                    continue

                keep += 1
                # Built only for kept entries; discarded ones never used it.
                entry["most_similar_instructions"] = {
                    all_instructions[i]: rouge_scores[i] for i in np.argsort(rouge_scores)[-10:][::-1]
                }
                entry["avg_similarity_score"] = float(np.mean(rouge_scores))
                machine_instruction_data.append(entry)
                # Later candidates must also be compared against this one.
                all_instructions.append(entry["instruction"])
                all_instruction_tokens.append(new_instruction_tokens)
                progress_bar.update(1)
            process_duration = time.time() - process_start

            print(f"Request {request_idx} took {request_duration:.2f}s, processing took {process_duration:.2f}s")
            print(f"Generated {total} instructions, kept {keep} instructions")
            # Checkpoint after every request so an interrupted run can resume
            # from regen.json.
            utils.jdump(machine_instruction_data, output_path)

    progress_bar.close()
def main(task, **kwargs):
    """Run the module-level callable named ``task`` with ``kwargs``.

    Args:
        task: Name of a callable defined in this module's global namespace.
        **kwargs: Keyword arguments passed through to that callable.

    Raises:
        KeyError: If no global named ``task`` exists.
    """
    entry_point = globals()[task]
    entry_point(**kwargs)
# Script entry point: expose ``main`` on the command line through ``fire``.
# The first positional CLI argument selects the task function and the
# remaining flags become its keyword arguments.
if __name__ == "__main__":
    fire.Fire(main)