Spaces:

xuxw98
/

TAPA

Runtime error

App Files Files Community

TAPA / scripts /prepare_dolly.py

xuxw98

Upload 58 files

7d52396 over 1 year ago

raw

history blame contribute delete

5.16 kB

	"""Implementation derived from https://github.com/tloen/alpaca-lora"""
	import sys
	from pathlib import Path

	# support running without installing as a package
	wd = Path(__file__).parent.parent.resolve()
	sys.path.append(str(wd))

	import torch
	import requests
	import json
	from torch.utils.data import random_split
	from lit_llama.tokenizer import Tokenizer
	from tqdm import tqdm


	# URL the raw Dolly 15k dataset is fetched from by `download` (a `.jsonl` file,
	# which `prepare` parses one JSON object per line).
	DATA_FILE = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl"
	# File name the downloaded data is stored under inside the destination directory.
	DATA_FILE_NAME = "dolly_data_cleaned.json"
	# Value written over the prompt positions of `labels` when `mask_inputs` is set.
	# NOTE(review): presumably the index the training loss is told to ignore - confirm
	# against the fine-tuning script.
	IGNORE_INDEX = -1


	def prepare(
	    destination_path: Path = Path("data/dolly"),
	    tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
	    test_split_size: int = 2000,
	    max_seq_length: int = 1024,
	    seed: int = 42,
	    mask_inputs: bool = False,  # as in alpaca-lora
	) -> None:
	    """Prepare the Dolly dataset for instruction tuning.

	    The raw JSON-lines file is downloaded into ``destination_path`` (unless it is
	    already there), each record's ``context``/``response`` fields are renamed to
	    ``input``/``output``, and the records are randomly partitioned, tokenized and
	    saved as ``train.pt`` and ``test.pt`` next to the downloaded file.

	    Args:
	        destination_path: Directory that receives the raw file and both ``.pt`` files.
	            It is created if missing.
	        tokenizer_path: Path handed to ``Tokenizer`` to load the tokenizer model.
	        test_split_size: Number of records held out for the test split.
	        max_seq_length: Forwarded to the tokenizer as ``max_length`` for every sample.
	        seed: Seed of the ``torch.Generator`` that drives the random split.
	        mask_inputs: Forwarded to ``prepare_sample``; when true the prompt part of
	            each label sequence is overwritten with ``IGNORE_INDEX``.

	    Raises:
	        ValueError: If ``test_split_size`` is negative or larger than the number of
	            records in the dataset.
	    """
	    destination_path.mkdir(parents=True, exist_ok=True)
	    file_path = destination_path / DATA_FILE_NAME
	    download(file_path)

	    # TODO: If we don't have the Meta weights, where do we get the tokenizer from?
	    tokenizer = Tokenizer(tokenizer_path)

	    # Opened with the default text encoding, mirroring how `download` writes the file.
	    with open(file_path, "r") as file:
	        # One JSON object per line; blank lines (e.g. a trailing one) carry no record.
	        data = [json.loads(line) for line in file if line.strip()]

	    # Rename the Dolly field names to the alpaca-style keys used by `generate_prompt`
	    # and `prepare_sample`.
	    for item in data:
	        item["input"] = item.pop("context")
	        item["output"] = item.pop("response")

	    # A test split larger than the dataset would make the train length negative;
	    # reject it explicitly instead of handing a nonsensical length to random_split.
	    if not 0 <= test_split_size <= len(data):
	        raise ValueError(
	            f"test_split_size must be between 0 and the dataset size ({len(data)}), got {test_split_size}"
	        )

	    # Partition the dataset into train and test
	    train_split_size = len(data) - test_split_size
	    train_set, test_set = random_split(
	        data,
	        lengths=(train_split_size, test_split_size),
	        generator=torch.Generator().manual_seed(seed),
	    )
	    train_set, test_set = list(train_set), list(test_set)

	    print(f"train has {len(train_set):,} samples")
	    print(f"val has {len(test_set):,} samples")

	    print("Processing train split ...")
	    train_set = [prepare_sample(sample, tokenizer, max_seq_length, mask_inputs) for sample in tqdm(train_set)]
	    torch.save(train_set, file_path.parent / "train.pt")

	    print("Processing test split ...")
	    test_set = [prepare_sample(sample, tokenizer, max_seq_length, mask_inputs) for sample in tqdm(test_set)]
	    torch.save(test_set, file_path.parent / "test.pt")


	def download(file_path: Path):
	    """Download the raw Dolly data from ``DATA_FILE`` and save it at ``file_path``.

	    Nothing is done when ``file_path`` already exists.

	    Args:
	        file_path: Destination of the downloaded file.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	        requests.RequestException: If the request itself fails (connection
	            error, timeout, ...).
	    """
	    if file_path.exists():
	        return

	    # Fetch and validate the response BEFORE opening the destination: opening first
	    # would leave an empty file behind when the request fails, and the existence
	    # check above would then skip the download on every later run.
	    response = requests.get(DATA_FILE, timeout=60)
	    response.raise_for_status()

	    # NOTE(review): written with the platform's default text encoding; readers have
	    # to open the file the same way.
	    with open(file_path, "w") as f:
	        f.write(response.text)


	def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool = True):
	    """Turn one dataset record into tokenized model inputs and labels.

	    The record is expected to provide:
	    - instruction: the task description
	    - input: optional extra context for the instruction (may be empty)
	    - output: the response text

	    The prompt is built with ``generate_prompt``; the training sequence is that
	    prompt with the response appended. Both are tokenized with a BOS token, and
	    only the prompt-plus-response sequence gets an EOS token. The labels start
	    as a copy of the full sequence; with ``mask_inputs`` enabled (the default)
	    the leading positions covering the prompt are overwritten with
	    ``IGNORE_INDEX``.

	    Returns:
	        A dict holding every key of ``example`` plus ``input_ids`` (prompt and
	        response), ``input_ids_no_response`` (prompt only) and ``labels``.
	    """
	    prompt = generate_prompt(example)
	    prompt_ids = tokenize(tokenizer, prompt, max_length=max_length, eos=False)
	    sequence_ids = tokenize(tokenizer, prompt + example["output"], eos=True, max_length=max_length)

	    # Labels mirror the full sequence; optionally blank out the prompt positions.
	    labels = sequence_ids.clone()
	    if mask_inputs:
	        prompt_length = len(prompt_ids)
	        labels[:prompt_length] = IGNORE_INDEX

	    sample = dict(example)
	    sample["input_ids"] = sequence_ids
	    sample["input_ids_no_response"] = prompt_ids
	    sample["labels"] = labels
	    return sample


	def tokenize(tokenizer: Tokenizer, string: str, max_length: int, eos=True) -> torch.Tensor:
	    """Encode ``string`` via ``tokenizer.encode``.

	    BOS is always requested; ``eos`` and ``max_length`` are forwarded unchanged.
	    """
	    token_ids = tokenizer.encode(string, bos=True, eos=eos, max_length=max_length)
	    return token_ids


	def generate_prompt(example):
	    """Build the standardized instruction prompt for one record.

	    Args:
	        example: Mapping with an ``instruction`` entry and an optional ``input``
	            entry. A missing or empty ``input`` selects the template without the
	            ``### Input:`` section.

	    Returns:
	        The prompt text, ending in ``### Response:`` so that the response can be
	        appended directly.
	    """
	    instruction = example["instruction"]
	    # `.get` so that records without an "input" key behave like records whose
	    # input is empty instead of raising KeyError.
	    context = example.get("input")

	    if context:
	        return (
	            "Below is an instruction that describes a task, paired with an input that provides further context. "
	            "Write a response that appropriately completes the request.\n\n"
	            f"### Instruction:\n{instruction}\n\n### Input:\n{context}\n\n### Response:"
	        )
	    return (
	        "Below is an instruction that describes a task. "
	        "Write a response that appropriately completes the request.\n\n"
	        f"### Instruction:\n{instruction}\n\n### Response:"
	    )


	if __name__ == "__main__":
	    # Expose `prepare` as a command-line interface; the import stays local so
	    # that importing this module does not require jsonargparse.
	    from jsonargparse import CLI as run_cli

	    run_cli(prepare)