"""Implementation derived from https://github.com/tloen/alpaca-lora"""
import sys
from pathlib import Path

# support running without installing the repo as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

import json

import requests
import torch
from torch.utils.data import random_split
from tqdm import tqdm

from lit_llama.tokenizer import Tokenizer


DATA_FILE = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl"
DATA_FILE_NAME = "dolly_data_cleaned.json"
IGNORE_INDEX = -1
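
# For reference, each line of the downloaded JSONL file is a JSON object with the
# fields "instruction", "context", "response" and "category", roughly like
# (values shortened, not an actual record):
#   {"instruction": "...", "context": "...", "response": "...", "category": "closed_qa"}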


def prepare(
    destination_path: Path = Path("data/dolly"),
    tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
    test_split_size: int = 2000,
    max_seq_length: int = 1024,
    seed: int = 42,
    mask_inputs: bool = False,
) -> None:
    """Prepare the Dolly dataset for instruction tuning.

    The output is a training and a test dataset saved as `train.pt` and `test.pt`,
    which store the preprocessed and tokenized prompts and labels.
    """
    destination_path.mkdir(parents=True, exist_ok=True)
    file_path = destination_path / DATA_FILE_NAME
    download(file_path)

    tokenizer = Tokenizer(tokenizer_path)

    # Load the JSONL file and rename the Dolly-specific fields ("context", "response")
    # to the Alpaca-style keys ("input", "output") that the rest of the script expects.
    with open(file_path, "r") as file:
        data = file.readlines()
    data = [json.loads(line) for line in data]
    for item in data:
        item["input"] = item.pop("context")
        item["output"] = item.pop("response")

    # Partition the data into a deterministic train/test split.
    train_split_size = len(data) - test_split_size
    train_set, test_set = random_split(
        data,
        lengths=(train_split_size, test_split_size),
        generator=torch.Generator().manual_seed(seed),
    )
    train_set, test_set = list(train_set), list(test_set)

    print(f"train has {len(train_set):,} samples")
    print(f"test has {len(test_set):,} samples")

    print("Processing train split ...")
    train_set = [prepare_sample(sample, tokenizer, max_seq_length, mask_inputs) for sample in tqdm(train_set)]
    torch.save(train_set, file_path.parent / "train.pt")

    print("Processing test split ...")
    test_set = [prepare_sample(sample, tokenizer, max_seq_length, mask_inputs) for sample in tqdm(test_set)]
    torch.save(test_set, file_path.parent / "test.pt")
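
# The saved splits can be loaded back with torch.load, e.g. (paths assume the
# default destination_path):
#   train_data = torch.load("data/dolly/train.pt")
#   # -> list of dicts with "input_ids", "input_ids_no_response", "labels"
#   #    plus the original record fields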


def download(file_path: Path):
    """Downloads the raw JSONL data file and saves it in the given destination."""
    if file_path.exists():
        return
    with open(file_path, "w") as f:
        f.write(requests.get(DATA_FILE).text)


def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool = True):
    """Processes a single sample.

    Each sample in the dataset consists of:
    - instruction: A string describing the task
    - input: A string holding a special input value for the instruction.
      This only applies to some samples, and in others this is empty.
    - output: The response string

    This function processes this data to produce a prompt text and a label for
    supervised training. The prompt text is formed as a single message including both
    the instruction and the input. The label/target is the same message but with the
    response attached.

    Finally, both the prompt and the label get tokenized. If `mask_inputs` is set,
    all tokens in the label that correspond to the original input prompt get masked
    out so that the loss is only computed on the response.
    """
    full_prompt = generate_prompt(example)
    full_prompt_and_response = full_prompt + example["output"]
    encoded_full_prompt = tokenize(tokenizer, full_prompt, max_length=max_length, eos=False)
    encoded_full_prompt_and_response = tokenize(tokenizer, full_prompt_and_response, eos=True, max_length=max_length)

    # The labels are a copy of the full sequence; optionally mask out the prompt part.
    labels = encoded_full_prompt_and_response.clone()
    if mask_inputs:
        labels[: len(encoded_full_prompt)] = IGNORE_INDEX

    return {
        **example,
        "input_ids": encoded_full_prompt_and_response,
        "input_ids_no_response": encoded_full_prompt,
        "labels": labels,
    }
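
# Illustrative example of the masking above (token ids are made up, not from the real
# tokenizer): if the encoded prompt is [1, 325, 67, 12] and the encoded prompt+response
# is [1, 325, 67, 12, 98, 44, 2], then with mask_inputs=True the labels become
# [-1, -1, -1, -1, 98, 44, 2], so the loss is only computed on the response tokens.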


def tokenize(tokenizer: Tokenizer, string: str, max_length: int, eos: bool = True) -> torch.Tensor:
    return tokenizer.encode(string, bos=True, eos=eos, max_length=max_length)


def generate_prompt(example):
    """Generates a standardized message to prompt the model with an instruction, optional input and a
    'response' field."""

    if example["input"]:
        return (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:"
        )
    return (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{example['instruction']}\n\n### Response:"
    )
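
# For a record with a non-empty "input", the rendered prompt looks like this
# (the first two sentences are emitted as a single line; the response text is
# appended afterwards by prepare_sample):
#
#   Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
#
#   ### Instruction:
#   <instruction>
#
#   ### Input:
#   <input>
#
#   ### Response: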


if __name__ == "__main__":
    from jsonargparse import CLI

    CLI(prepare)
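
# Example invocation (hypothetical path; adjust to wherever this script lives in the repo):
#   python scripts/prepare_dolly.py --destination_path data/dolly \
#       --tokenizer_path checkpoints/lit-llama/tokenizer.model --max_seq_length 1024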