Spaces:

xuxw98
/

TAPA

Runtime error

App Files Files Community

TAPA / scripts /prepare_any_text.py

xuxw98

Upload 58 files

7d52396 over 1 year ago

raw

history blame

2.97 kB

	"""Implementation derived from https://github.com/tloen/alpaca-lora"""
	import sys
	from pathlib import Path

	# support running without installing as a package
	wd = Path(__file__).parent.parent.resolve()
	sys.path.append(str(wd))

	import torch
	import requests
	import json
	from torch.utils.data import random_split
	from lit_llama.tokenizer import Tokenizer
	from tqdm import tqdm


	IGNORE_INDEX = -1

	DATA_FILE_NAME = "input.txt"


	def prepare(
	destination_path: Path = Path("data/any"),
	tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
	test_split_ratio: float = 0.9, # default 90% train, 10% validation
	max_seq_length: int = 256,
	seed: int = 42,
	data_file_name: str = DATA_FILE_NAME,
	) -> None:
	"""Prepare any dataset for finetuning (akin to Shakespheare full tuning).

	The output is a training and validation dataset saved as `train.pt` and `val.pt`,
	which stores the preprocessed and tokenized prompts and labels.
	"""

	destination_path.mkdir(parents=True, exist_ok=True)
	file_path = destination_path / data_file_name
	if not file_path.exists():
	raise AssertionError(f"{data_file_name} is provided by the user")

	# TODO: If we don't have the Meta weights, where do we get the tokenizer from?
	tokenizer = Tokenizer(tokenizer_path)

	data = []

	with open(file_path, "r") as input_file:
	for line in input_file.readlines():
	data.append(line)

	# Partition the dataset into train and test
	train_split_size = int(len(data) * test_split_ratio)
	test_split_size = len(data) - train_split_size
	train_set, test_set = random_split(
	data,
	lengths=(train_split_size, test_split_size),
	generator=torch.Generator().manual_seed(seed),
	)
	train_set, test_set = list(train_set), list(test_set)

	print(f"train has {len(train_set):,} samples")
	print(f"val has {len(test_set):,} samples")

	print("Processing train split ...")
	train_set = [
	prepare_line(line, tokenizer, max_seq_length) for line in tqdm(train_set)
	]
	torch.save(train_set, file_path.parent / "train.pt")

	print("Processing test split ...")
	test_set = [
	prepare_line(line, tokenizer, max_seq_length) for line in tqdm(test_set)
	]
	torch.save(test_set, file_path.parent / "test.pt")


	def prepare_line(line: str, tokenizer: Tokenizer, max_length: int):
	"""Processes a single sample.

	This function processes the line to produce the tokenized version of it.
	"""
	encoded_full_prompt = tokenize(tokenizer, line, max_length=max_length, eos=False)
	return {
	"input_ids": encoded_full_prompt,
	"labels": encoded_full_prompt,
	}


	def tokenize(
	tokenizer: Tokenizer, string: str, max_length: int, eos=True
	) -> torch.Tensor:
	return tokenizer.encode(string, bos=True, eos=eos, max_length=max_length)


	if __name__ == "__main__":
	from jsonargparse import CLI

	CLI(prepare)