TAPA / scripts /prepare_redpajama.py
xuxw98's picture
Upload 58 files
7d52396
import json
import glob
import os
from pathlib import Path
import sys
# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
import numpy as np
from tqdm import tqdm
from lit_llama import Tokenizer
import lit_llama.packed_dataset as packed_dataset
filenames_sample = [
"arxiv_sample.jsonl",
"book_sample.jsonl",
"c4_sample.jsonl",
"cc_2019-30_sample.jsonl",
"cc_2020-05_sample.jsonl",
"cc_2021-04_sample.jsonl",
"cc_2022-05_sample.jsonl",
"cc_2023-06_sample.jsonl",
"github_sample.jsonl",
"stackexchange_sample.jsonl",
"wikipedia_sample.jsonl",
]
filename_sets = {
"arxiv": "arxiv/arxiv*",
"book": "book/book*",
"c4": "c4/c4-train*",
"common_crawl": "common_crawl/*",
"github": "github/filtered*",
"stackexchange": "stackexchange/stackexchange*",
"wikipedia": "wikipedia/wiki*",
}
def prepare_sample(
source_path: Path,
tokenizer_path: Path,
destination_path: Path,
chunk_size: int,
match = ""
) -> None:
"""Prepare the "Red Pajama" dataset. We assume tokenizer has been trained (i.e. we reuse LLaMA's tokenizer model)."""
destination_path.mkdir(parents=True, exist_ok=True)
tokenizer = Tokenizer(tokenizer_path)
for name in filenames_sample:
if match and match not in name:
continue
filepath = source_path / name
if not filepath.is_file():
raise RuntimeError(
f"Input file not found at {filepath}. \n"
"Make sure you download the data, e.g. wget -i https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through \n"
"https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T \n"
"https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n"
)
prefix, _ = os.path.splitext(name)
builder = packed_dataset.PackedDatasetBuilder(
outdir=destination_path,
prefix=prefix,
chunk_size=chunk_size,
sep_token=tokenizer.bos_id,
dtype="auto",
vocab_size=tokenizer.vocab_size,
)
print(f"Processing {name}")
with open(filepath, encoding="utf-8") as f:
for row in tqdm(f):
text = json.loads(row)["text"]
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
builder.write_reminder()
def prepare_full(
source_path: Path,
tokenizer_path: Path,
destination_path: Path,
chunk_size: int,
match: str = ""
) -> None:
"""Prepare the "Red Pajama" dataset. We assume tokenizer has been trained (i.e. we reuse LLaMA's tokenizer model)."""
import zstandard as zstd
destination_path.mkdir(parents=True, exist_ok=True)
tokenizer = Tokenizer(tokenizer_path)
for set_name, pattern in filename_sets.items():
if match and match not in set_name:
continue
is_cc = set_name == "common_crawl"
filenames = glob.glob(os.path.join(source_path, pattern), recursive=True)
if not filenames:
raise RuntimeError(
f"No files matching {pattern} found at {source_path}. \n"
"Make sure you download the data, e.g. wget -i https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through \n"
"https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T \n"
"https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n"
)
builder = packed_dataset.PackedDatasetBuilder(
outdir=destination_path,
prefix=set_name,
chunk_size=chunk_size,
sep_token=tokenizer.bos_id,
dtype="auto",
vocab_size=tokenizer.vocab_size,
)
for name in filenames:
filepath = source_path / name
print(f"Processing {name}")
if is_cc:
with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
for row in tqdm(f):
text = json.loads(row)["text"]
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
else:
with open(filepath, encoding="utf-8") as f:
for row in tqdm(f):
text = json.loads(row)["text"]
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
builder.write_reminder()
def prepare(
source_path: Path = Path("data/RedPajama-Data-1T-Sample"),
tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
destination_path: Path = Path("data/red_pajama_sample"),
chunk_size: int = 2049 * 1024, # 2048 block size + 1 for causal (from LLama), 1024 blocks
sample: bool = False,
match: str = "",
) -> None:
"""Prepare the "Red Pajama" dataset. We assume tokenizer has been trained (i.e. we reuse LLaMA's tokenizer model)."""
if sample:
prepare_sample(
source_path=source_path,
tokenizer_path=tokenizer_path,
destination_path=destination_path,
chunk_size=chunk_size,
match=match,
)
else:
prepare_full(
source_path=source_path,
tokenizer_path=tokenizer_path,
destination_path=destination_path,
chunk_size=chunk_size,
match=match,
)
if __name__ == "__main__":
from jsonargparse import CLI
CLI(prepare)