MasaakiKotera committed
Commit 1b3f51e
Parent(s): 8f79412
Upload tokenization.py with huggingface_hub
tokenization.py +61 -54
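The commit message indicates the file was pushed programmatically. A minimal sketch of such an upload with huggingface_hub, assuming a placeholder repo_id (the actual target repository is not shown on this page):

    from huggingface_hub import HfApi

    api = HfApi()  # picks up the token from `huggingface-cli login` by default
    # repo_id is a placeholder; substitute the target repository.
    api.upload_file(
        path_or_fileobj="tokenization.py",
        path_in_repo="tokenization.py",
        repo_id="<username>/<repo-name>",
        commit_message="Upload tokenization.py with huggingface_hub",
    )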
tokenization.py CHANGED
@@ -9,80 +9,87 @@ def parse_arguments():
    parser.add_argument("--data_dir", type=str, required=True, help="Directory of the raw data.")
    parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to the trained AutoTokenizer.")
    parser.add_argument("--out_dir", type=str, required=True, help="Directory of output files.")
-    parser.add_argument("--
+    parser.add_argument("--file_name", type=str, default="data.txt", required=True)
+    parser.add_argument("--block_size", type=int, default=512, help="Max token length.")
+    parser.add_argument("--is_start_with_eos", type=bool, default=False, help="Whether each line starts with `eos_token`.")
+    parser.add_argument("--is_end_with_eos", type=bool, default=False, help="Whether each line ends with `eos_token`.")
+    parser.add_argument("--split_ratio", type=float, default=0.99, help="Train-validation split ratio.")
    return parser.parse_args()

-def
-
+def tokenize_and_save_lines(tokenizer, input_file, train_txt_file, val_txt_file, train_bin_file, val_bin_file,is_start_with_eos, is_end_with_eos, block_size, split_ratio):
+    train_ids = []
+    val_ids = []
+    train_lines = []
+    val_lines = []
+
+    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
-
+
    random.shuffle(lines)
    split_at = int(split_ratio * len(lines))
-
-
-
-
-        for line in lines:
-            f.write(line.replace(' ', ''))
-
-def tokenize_lines(tokenizer, lines, end_with_eos, block_size = 1e10):
-    tokenized_ids = []
-    for i, line in enumerate(lines):
-        if not end_with_eos:
-            line = line.strip() + tokenizer.eos_token
+    train_lines_list = lines[:split_at]
+    val_lines_list = lines[split_at:]
+
+    for i, line in enumerate(train_lines_list):
        ids = tokenizer.encode(line)
+        if not is_end_with_eos:
+            ids.append(0)
+        elif not is_start_with_eos:
+            ids.insert(0,0)

-
-
-
-
-
-
-
+        if len(ids) < block_size:
+            train_ids.extend(ids)
+            train_lines.append(line.strip())
+        if i % 1000000 == 0:
+            print(f"now processing {i}...")
+
+    for i, line in enumerate(val_lines_list):
+        ids = tokenizer.encode(line)
+        if not is_end_with_eos:
+            ids.append(0)
+        elif not is_start_with_eos:
+            ids.insert(0,0)
+
+        if len(ids) <= block_size:
+            val_ids.extend(ids)
+            val_lines.append(line.strip())
+
+    # Save tokenized data
+    save_tokenized_data(train_ids, train_bin_file)
+    save_tokenized_data(val_ids, val_bin_file)
+    print("Tokenized data saved...")
+
+    # Save text data
+    save_text_data(train_lines, train_txt_file)
+    save_text_data(val_lines, val_txt_file)
+    print("Text data saved...")

def save_tokenized_data(tokenized_data, file_path):
    np_data = np.array(tokenized_data, dtype=np.uint16)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    np_data.tofile(file_path)

+def save_text_data(text_data, file_path):
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    with open(file_path, 'w', encoding='utf-8') as f:
+        for line in text_data:
+            f.write(line + '\n')
+
def main():
-
-    parser.add_argument("--data_dir", type=str, required=True, help="Directory of raw data & output files")
-    parser.add_argument("--file_name", type=str, default="data.txt",required=True)
-    parser.add_argument("--out_dir", type=str, required=False, help="directory of output files(default=data_dir). A train.bin and a valid.bin will be built and expect to be used in train.py")
-    parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to a trained AutoTokenizer")
-    parser.add_argument("--block_size", type=str, required=True, help="Max token length")
-    args = parser.parse_args()
+    args = parse_arguments()

    # Paths setup
-    if args.out_dir is None:
-        out_dir = args.data_dir
-    else:
-        out_dir = args.out_dir
    raw_data_path = os.path.join(args.data_dir, args.file_name)
-    train_txt_path = os.path.join(out_dir, 'train.txt')
-    val_txt_path = os.path.join(out_dir, 'val.txt')
-    train_bin_path = os.path.join(out_dir, 'train.bin')
-    val_bin_path = os.path.join(out_dir, 'val.bin')
+    train_txt_path = os.path.join(args.out_dir, 'train.txt')
+    val_txt_path = os.path.join(args.out_dir, 'val.txt')
+    train_bin_path = os.path.join(args.out_dir, 'train.bin')
+    val_bin_path = os.path.join(args.out_dir, 'val.bin')
    print("Paths setup complete...")

-    # Data preparation
-    train_lines, val_lines = shuffle_and_split_data(raw_data_path)
-    write_to_file(train_txt_path, train_lines)
-    write_to_file(val_txt_path, val_lines)
-    print("Data preparation complete...")
-
    # Tokenization
-    end_with_eos = False
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
-
-
-    print("Tokenization complete...")
-
-    # Save tokenized data
-    save_tokenized_data(train_ids, train_bin_path)
-    save_tokenized_data(val_ids, val_bin_path)
-    print("Tokenized data saved...")
+    tokenize_and_save_lines(tokenizer, raw_data_path, train_txt_path, val_txt_path, train_bin_path, val_bin_path, args.is_start_with_eos, args.is_end_with_eos, args.block_size, args.split_ratio)
+    print("Tokenization and data saving")

if __name__ == "__main__":
-    main()
+    main()
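Since save_tokenized_data writes the token ids as a flat uint16 stream with ndarray.tofile, the resulting train.bin / val.bin have no header and can be read back directly; a minimal sketch of loading one for training (file path illustrative):

    import numpy as np

    # train.bin holds raw uint16 token ids written by ndarray.tofile (no header),
    # so it can be memory-mapped and sliced lazily during training.
    train_ids = np.memmap("train.bin", dtype=np.uint16, mode="r")
    print(len(train_ids), train_ids[:10])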