import copy
import json
import numpy as np
from torch.utils.data import Dataset
from transformers import LlamaTokenizer

TEMPLATE = {
"description": "Template used by LLM.",
"prompt_no_input_format": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
"prompt_with_input_format": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
"response_split": "### Response:",
}


class LMPrompter:
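    """Renders Alpaca-style prompts from an instruction and optional input, and
    recovers the model's answer from a generation by splitting on "### Response:"."""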
def __call__(self, instruction, input=None):
if input is None or len(input) == 0:
return TEMPLATE["prompt_no_input_format"].format(instruction=instruction)
else:
return TEMPLATE["prompt_with_input_format"].format(instruction=instruction, input=input)

    def get_response(self, output: str) -> str:
return output.split(TEMPLATE["response_split"])[-1].strip()


class DollyDataset(Dataset):
"""Each line of the annotation file is a json object with the following fields:
{
"instruction": "What is a dispersive prism?",
"context": "In optics, a dispersive prism is an optical prism that is used to disperse light, that is, to separate light into its spectral components (the colors of the rainbow). Different wavelengths (colors) of light will be deflected by the prism at different angles.[1] This is a result of the prism material's index of refraction varying with wavelength (dispersion). Generally, longer wavelengths (red) undergo a smaller deviation than shorter wavelengths (blue). The dispersion of white light into colors by a prism led Sir Isaac Newton to conclude that white light consisted of a mixture of different colors.",
"response": "A dispersive prism is an optical prism that disperses the light's different wavelengths at different angles. When white light is shined through a dispersive prism it will separate into the different colors of the rainbow.",
"category": "summarization"
}
"""

    def __init__(self, tokenizer, ann_path: str, add_eos=True, ignore_instruction=True, **kwargs):
        """
        ann_path (str): path to the JSON Lines annotation file (one example per line)
        """
assert tokenizer.add_eos_token is False, "tokenizer should not add eos token by default"
self.tokenizer: LlamaTokenizer = tokenizer
self.annotation = []
self.prompter = LMPrompter()
self.add_eos = add_eos
self.ignore_instruction = ignore_instruction
self.load_annotation(ann_path)

    def load_annotation(self, ann_path):
        self.annotation = []
        with open(ann_path, "r") as f:
            for line in f:
                self.annotation.append(json.loads(line))

    def __len__(self):
return len(self.annotation)

    def process_text(self, ann):
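        """Render one Dolly record into (prompt, answer) strings."""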
instruction = ann["instruction"]
context = ann["context"]
response = ann["response"]
instruction = self.prompter(instruction=instruction, input=context)
return dict(instruction=instruction, answer=response)

    def tokenize(self, text):
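        """Tokenize prompt + answer as one sequence (truncated to 512 tokens) and build
        label ids, masking the prompt tokens with -100 when ignore_instruction is set."""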
res = self.tokenizer(
text["instruction"] + text["answer"],
return_tensors=None,
padding="do_not_pad",
truncation=True,
max_length=512,
)
# manually add eos token
if res["input_ids"][-1] != self.tokenizer.eos_token_id and len(res["input_ids"]) < 512 and self.add_eos:
res["input_ids"].append(self.tokenizer.eos_token_id)
res["attention_mask"].append(1)
labels = copy.deepcopy(res["input_ids"])
        # mask the instruction tokens with -100 so the loss is computed only on the response
if self.ignore_instruction:
instruction_token = self.tokenizer(
text["instruction"], return_tensors=None, padding="do_not_pad", truncation=True, max_length=512
)
labels = [-100] * len(instruction_token["input_ids"]) + labels[len(instruction_token["input_ids"]) :]
res.update(labels=labels)
return res

    def __getitem__(self, index):
ann = self.annotation[index]
text = self.process_text(ann)
res = self.tokenize(text)
res.update(text)
return res

    def collater(self, samples):
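        """Pad a list of samples to the longest sequence (respecting tokenizer.padding_side)
        and collate them into a tensor batch."""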
question_list, answer_list, input_id_list, attention_mask_list, labels_list = [], [], [], [], []
for sample in samples:
question_list.append(sample["instruction"])
answer_list.append(sample["answer"])
input_id_list.append(sample["input_ids"])
attention_mask_list.append(sample["attention_mask"])
labels_list.append(sample["labels"])
        # We have to pad the labels before calling `tokenizer.pad`, as that method won't pad them and
        # requires them to all be the same length in order to return tensors.
max_label_length = max(len(l) for l in labels_list)
padding_side = self.tokenizer.padding_side
padded_labels = []
for l in labels_list:
remainder = [-100] * (max_label_length - len(l))
if isinstance(l, list):
l = l + remainder if padding_side == "right" else remainder + l
elif padding_side == "right":
l = np.concatenate([l, remainder]).astype(np.int64)
else:
l = np.concatenate([remainder, l]).astype(np.int64)
padded_labels.append(l)
padded_samples = self.tokenizer.pad(
{"input_ids": input_id_list, "attention_mask": attention_mask_list, "labels": padded_labels},
return_tensors="pt",
padding="longest",
)
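        # mask padding positions and the leading token (BOS) so they never contribute to the loss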
labels = padded_samples["labels"]
labels[labels == self.tokenizer.pad_token_id] = -100
labels[:, 0] = -100
return {
"input_ids": padded_samples["input_ids"],
"attention_mask": padded_samples["attention_mask"],
"labels": labels,
"instruction": question_list,
"answer": answer_list,
}


def build_dolly_dataset(
tokenizer,
ann_path="data/dolly/databricks-dolly-15k.jsonl",
**kwargs,
):
return DollyDataset(
tokenizer=tokenizer,
ann_path=ann_path,
**kwargs,
)
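

# Minimal usage sketch (assumptions: a LLaMA checkpoint at the hypothetical path
# "path/to/llama" and the Dolly jsonl at the default path above; the pad-token
# choice below is a common convention, not something this file pins down).
if __name__ == "__main__":
    tokenizer = LlamaTokenizer.from_pretrained("path/to/llama")
    tokenizer.pad_token_id = 0  # LLaMA ships without a pad token; 0 (<unk>) is a common stand-in
    dataset = build_dolly_dataset(tokenizer=tokenizer)
    print(f"{len(dataset)} examples")
    batch = dataset.collater([dataset[0], dataset[1]])
    print(batch["input_ids"].shape, batch["labels"].shape)  # padded to the longer of the two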