|
import os |
|
import numpy as np |
|
from transformers import AutoTokenizer |
|
import random |
|
import argparse |
|
|
|
def _str_to_bool(value):
    """Convert a command-line string such as ``"true"`` or ``"0"`` to a bool.

    argparse hands the raw string to ``type``. The built-in ``bool`` would
    treat every non-empty string (including ``"False"``) as ``True``, so the
    flags need an explicit converter.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_arguments():
    """Parse the command-line options of the tokenization script.

    Returns:
        argparse.Namespace with ``data_dir``, ``tokenizer_path``, ``out_dir``,
        ``file_name``, ``block_size``, ``is_start_with_eos``,
        ``is_end_with_eos`` and ``split_ratio``.
    """
    parser = argparse.ArgumentParser(description='Process the text data for tokenization.')
    parser.add_argument("--data_dir", type=str, required=True, help="Directory of the raw data.")
    parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to the trained AutoTokenizer.")
    parser.add_argument("--out_dir", type=str, required=True, help="Directory of output files.")
    # Not marked required: a required option can never fall back to its default.
    parser.add_argument("--file_name", type=str, default="data.txt",
                        help="Name of the raw text file inside --data_dir.")
    parser.add_argument("--block_size", type=int, default=512, help="Max token length.")
    # Explicit converter so that "--is_start_with_eos False" really yields False.
    parser.add_argument("--is_start_with_eos", type=_str_to_bool, default=False,
                        help="Whether each line starts with `eos_token`.")
    parser.add_argument("--is_end_with_eos", type=_str_to_bool, default=False,
                        help="Whether each line ends with `eos_token`.")
    parser.add_argument("--split_ratio", type=float, default=0.99, help="Train-validation split ratio.")
    return parser.parse_args()
|
|
|
def _encode_lines(tokenizer, lines, is_start_with_eos, is_end_with_eos, block_size, report_progress=False):
    """Tokenize *lines* and return ``(flat_ids, kept_lines)``.

    ``flat_ids`` is the concatenation of the ids of every kept line and
    ``kept_lines`` holds the stripped text of those same lines, in order.
    Lines whose id sequence is longer than ``block_size`` are dropped.
    """
    flat_ids = []
    kept_lines = []
    for index, line in enumerate(lines):
        # The code appends to / inserts into the result, so ``encode`` is
        # expected to return a mutable list of ints.
        ids = tokenizer.encode(line)

        # Id 0 is used as the separator marker.
        # NOTE(review): assumes the tokenizer's `eos_token` has id 0 - confirm.
        # Only one marker is ever added: the start marker is considered only
        # when is_end_with_eos is true.
        # NOTE(review): confirm this if/elif pairing is the intended behaviour.
        if not is_end_with_eos:
            ids.append(0)
        elif not is_start_with_eos:
            ids.insert(0, 0)

        # block_size is the maximum length, so a line of exactly block_size
        # ids is kept.
        if len(ids) <= block_size:
            flat_ids.extend(ids)
            kept_lines.append(line.strip())

        if report_progress and index % 1000000 == 0:
            print(f"now processing {index}...")
    return flat_ids, kept_lines


def tokenize_and_save_lines(tokenizer, input_file, train_txt_file, val_txt_file, train_bin_file, val_bin_file, is_start_with_eos, is_end_with_eos, block_size, split_ratio):
    """Shuffle, split, tokenize and persist a line-oriented text file.

    The lines of ``input_file`` are shuffled with ``random.shuffle`` (unseeded
    here, so the split differs between runs unless the caller seeds
    ``random``). The first ``int(split_ratio * n)`` lines form the training
    split and the remainder the validation split.

    Both splits are filtered with the same rule: a line is kept when its id
    sequence, including the added marker, has at most ``block_size`` ids.

    Args:
        tokenizer: object exposing ``encode(text)`` that returns a list of ids.
        input_file: path of the UTF-8 text file, one sample per line.
        train_txt_file / val_txt_file: destinations for the kept, stripped lines.
        train_bin_file / val_bin_file: destinations for the flat token ids.
        is_start_with_eos: whether lines already start with the marker.
        is_end_with_eos: whether lines already end with the marker.
        block_size: maximum number of ids allowed per line.
        split_ratio: fraction of lines assigned to the training split.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    random.shuffle(lines)
    split_at = int(split_ratio * len(lines))

    # Progress is reported for the training split only.
    train_ids, train_lines = _encode_lines(
        tokenizer, lines[:split_at], is_start_with_eos, is_end_with_eos, block_size,
        report_progress=True,
    )
    val_ids, val_lines = _encode_lines(
        tokenizer, lines[split_at:], is_start_with_eos, is_end_with_eos, block_size,
    )

    save_tokenized_data(train_ids, train_bin_file)
    save_tokenized_data(val_ids, val_bin_file)
    print("Tokenized data saved...")

    save_text_data(train_lines, train_txt_file)
    save_text_data(val_lines, val_txt_file)
    print("Text data saved...")
|
|
|
def save_tokenized_data(tokenized_data, file_path):
    """Write token ids to ``file_path`` as a flat array of raw ``uint16`` values.

    Args:
        tokenized_data: flat sequence of integer token ids.
        file_path: destination file; missing parent directories are created.

    Raises:
        ValueError: if any id does not fit into an unsigned 16-bit integer.
    """
    ids = np.asarray(tokenized_data, dtype=np.int64)
    limits = np.iinfo(np.uint16)
    # Check the range explicitly: depending on the NumPy version, out-of-range
    # values are either wrapped silently or rejected with a less specific error.
    if ids.size and (ids.min() < limits.min or ids.max() > limits.max):
        raise ValueError(
            f"token ids must be within [{limits.min}, {limits.max}] to be stored as uint16"
        )

    # dirname is '' for a bare file name, and os.makedirs('') raises.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    ids.astype(np.uint16).tofile(file_path)
|
|
|
def save_text_data(text_data, file_path):
    """Write each string of ``text_data`` to ``file_path``, one per line (UTF-8).

    Args:
        text_data: iterable of strings without trailing newlines.
        file_path: destination file; missing parent directories are created.
    """
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in text_data)
|
|
|
def main():
    """Script entry point: read the CLI options, then tokenize and save the data.

    The raw file is ``<data_dir>/<file_name>``. The outputs ``train.txt``,
    ``val.txt``, ``train.bin`` and ``val.bin`` are placed in ``out_dir``.
    """
    cli = parse_arguments()

    def in_out_dir(name):
        # Every output file lives directly inside the output directory.
        return os.path.join(cli.out_dir, name)

    source_path = os.path.join(cli.data_dir, cli.file_name)
    train_txt, val_txt = in_out_dir('train.txt'), in_out_dir('val.txt')
    train_bin, val_bin = in_out_dir('train.bin'), in_out_dir('val.bin')
    print("Paths setup complete...")

    loaded_tokenizer = AutoTokenizer.from_pretrained(cli.tokenizer_path)
    tokenize_and_save_lines(
        tokenizer=loaded_tokenizer,
        input_file=source_path,
        train_txt_file=train_txt,
        val_txt_file=val_txt,
        train_bin_file=train_bin,
        val_bin_file=val_bin,
        is_start_with_eos=cli.is_start_with_eos,
        is_end_with_eos=cli.is_end_with_eos,
        block_size=cli.block_size,
        split_ratio=cli.split_ratio,
    )
    print("Tokenization and data saving")
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()