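"""Preprocess the IWSLT'17 fr-en bitext for byte-level BPE experiments.

Converts the raw IWSLT train/dev/test files to plain text, pretokenizes them
with Moses, and optionally produces BPE, byte-level BPE (BBPE), byte, and
character tokenizations of every split (see preprocess_iwslt17 below).
"""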
import argparse
import os
import os.path as op
from collections import namedtuple
from multiprocessing import cpu_count
from typing import List, Optional

import sentencepiece as sp
from fairseq.data.encoders.byte_bpe import ByteBPE
from fairseq.data.encoders.byte_utils import byte_encode
from fairseq.data.encoders.bytes import Bytes
from fairseq.data.encoders.characters import Characters
from fairseq.data.encoders.moses_tokenizer import MosesTokenizer
from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE


SPLITS = ["train", "valid", "test"]


def _convert_xml(in_path: str, out_path: str):
    """Extract segment text from the <seg ...> lines of an IWSLT XML file."""
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            ss = s.strip()
            if not ss.startswith("<seg"):
                continue
            ss = ss.replace("</seg>", "").split('">')
            assert len(ss) == 2
            f_o.write(ss[1].strip() + "\n")


def _convert_train(in_path: str, out_path: str):
    """Keep only the plain-text lines of the raw training file, dropping tag lines."""
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            ss = s.strip()
            if ss.startswith("<"):
                continue
            f_o.write(ss + "\n")


def _get_bytes(in_path: str, out_path: str):
    """Re-tokenize each line into a byte sequence with fairseq's Bytes encoder."""
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            f_o.write(Bytes.encode(s.strip()) + "\n")


def _get_chars(in_path: str, out_path: str):
    """Re-tokenize each line into a character sequence with the Characters encoder."""
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            f_o.write(Characters.encode(s.strip()) + "\n")


def pretokenize(in_path: str, out_path: str, src: str, tgt: str):
    """Pretokenize a file with Moses via fairseq's MosesTokenizer wrapper."""
    # MosesTokenizer expects an args object; a namedtuple stands in for it.
    Args = namedtuple(
        "Args",
        [
            "moses_source_lang",
            "moses_target_lang",
            "moses_no_dash_splits",
            "moses_no_escape",
        ],
    )
    args = Args(
        moses_source_lang=src,
        moses_target_lang=tgt,
        moses_no_dash_splits=False,
        moses_no_escape=False,
    )
    pretokenizer = MosesTokenizer(args)
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            f_o.write(pretokenizer.encode(s.strip()) + "\n")


def _convert_to_bchar(in_path_prefix: str, src: str, tgt: str, out_path: str):
    """Concatenate the source and target sides and byte-encode every line,
    producing the byte-level "characters" used to train a BBPE model."""
    with open(out_path, "w") as f_o:
        for lang in [src, tgt]:
            with open(f"{in_path_prefix}.{lang}") as f:
                for s in f:
                    f_o.write(byte_encode(s.strip()) + "\n")


def _get_bpe(in_path: str, model_prefix: str, vocab_size: int):
    """Train a SentencePiece BPE model; writes {model_prefix}.model and .vocab."""
    arguments = [
        f"--input={in_path}",
        f"--model_prefix={model_prefix}",
        "--model_type=bpe",
        f"--vocab_size={vocab_size}",
        "--character_coverage=1.0",
        "--normalization_rule_name=identity",
        f"--num_threads={cpu_count()}",
    ]
    sp.SentencePieceTrainer.Train(" ".join(arguments))


def _apply_bbpe(model_path: str, in_path: str, out_path: str):
    """Encode a file line-by-line with a trained byte-level BPE model."""
    Args = namedtuple("Args", ["sentencepiece_model_path"])
    args = Args(sentencepiece_model_path=model_path)
    tokenizer = ByteBPE(args)
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            f_o.write(tokenizer.encode(s.strip()) + "\n")


def _apply_bpe(model_path: str, in_path: str, out_path: str):
    """Encode a file line-by-line with a trained SentencePiece BPE model."""
    Args = namedtuple("Args", ["sentencepiece_model"])
    args = Args(sentencepiece_model=model_path)
    tokenizer = SentencepieceBPE(args)
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            f_o.write(tokenizer.encode(s.strip()) + "\n")


def _concat_files(in_paths: List[str], out_path: str):
    """Concatenate the given files into a single output file."""
    with open(out_path, "w") as f_o:
        for p in in_paths:
            with open(p) as f:
                for r in f:
                    f_o.write(r)
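

# A sketch of the files written under --root (names follow the path templates
# used below; {K} is the requested vocabulary size):
#   {split}.{lang}                    plain text per split (train/valid/test)
#   {split}.moses.{lang}              Moses-pretokenized text
#   {split}.moses.bpe{K}.{lang}       SentencePiece BPE tokenization
#   {split}.moses.bbpe{K}.{lang}      byte-level BPE tokenization
#   {split}.moses.bytes.{lang}        byte tokenization
#   {split}.moses.chars.{lang}        character tokenization
#   spm_bpe{K}.* / spm_bbpe{K}.*      trained SentencePiece models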


def preprocess_iwslt17(
    root: str,
    src: str,
    tgt: str,
    bpe_size: Optional[int],
    need_chars: bool,
    bbpe_size: Optional[int],
    need_bytes: bool,
):
    """Convert, pretokenize, and optionally BPE/BBPE/byte/char-tokenize IWSLT'17."""
    # Convert the raw IWSLT files (tagged train file, XML dev/test) to plain text.
    in_root = op.join(root, f"{src}-{tgt}")
    for lang in [src, tgt]:
        _convert_train(
            op.join(in_root, f"train.tags.{src}-{tgt}.{lang}"),
            op.join(root, f"train.{lang}"),
        )
        _convert_xml(
            op.join(in_root, f"IWSLT17.TED.dev2010.{src}-{tgt}.{lang}.xml"),
            op.join(root, f"valid.{lang}"),
        )
        _convert_xml(
            op.join(in_root, f"IWSLT17.TED.tst2015.{src}-{tgt}.{lang}.xml"),
            op.join(root, f"test.{lang}"),
        )

    # Moses pretokenization for every split and language.
    for lang in [src, tgt]:
        for split in SPLITS:
            pretokenize(
                op.join(root, f"{split}.{lang}"),
                op.join(root, f"{split}.moses.{lang}"),
                src,
                tgt,
            )

    if bpe_size is not None:
        # Train a BPE model on the concatenated bitext, then apply it to every split.
        concatenated_train_path = op.join(root, "train.all")
        _concat_files(
            [
                op.join(root, f"train.moses.{src}"),
                op.join(root, f"train.moses.{tgt}"),
            ],
            concatenated_train_path,
        )
        bpe_model_prefix = op.join(root, f"spm_bpe{bpe_size}")
        _get_bpe(concatenated_train_path, bpe_model_prefix, bpe_size)
        os.remove(concatenated_train_path)

        for lang in [src, tgt]:
            for split in SPLITS:
                _apply_bpe(
                    bpe_model_prefix + ".model",
                    op.join(root, f"{split}.moses.{lang}"),
                    op.join(root, f"{split}.moses.bpe{bpe_size}.{lang}"),
                )

    if need_bytes:
        for lang in [src, tgt]:
            for split in SPLITS:
                _get_bytes(
                    op.join(root, f"{split}.moses.{lang}"),
                    op.join(root, f"{split}.moses.bytes.{lang}"),
                )

    if need_chars:
        for lang in [src, tgt]:
            for split in SPLITS:
                _get_chars(
                    op.join(root, f"{split}.moses.{lang}"),
                    op.join(root, f"{split}.moses.chars.{lang}"),
                )

    if bbpe_size is not None:
        # Train a BBPE model on byte-encoded bitext, then apply it to every split.
        bchar_path = op.join(root, "train.bchar")
        _convert_to_bchar(op.join(root, "train.moses"), src, tgt, bchar_path)
        bbpe_model_prefix = op.join(root, f"spm_bbpe{bbpe_size}")
        _get_bpe(bchar_path, bbpe_model_prefix, bbpe_size)
        os.remove(bchar_path)

        for lang in [src, tgt]:
            for split in SPLITS:
                _apply_bbpe(
                    bbpe_model_prefix + ".model",
                    op.join(root, f"{split}.moses.{lang}"),
                    op.join(root, f"{split}.moses.bbpe{bbpe_size}.{lang}"),
                )


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", type=str, default="data")
    parser.add_argument(
        "--bpe-vocab",
        default=None,
        type=int,
        help="Generate tokenized bitext with BPE of the given vocabulary size. "
        "Defaults to None (disabled).",
    )
    parser.add_argument(
        "--bbpe-vocab",
        default=None,
        type=int,
        help="Generate tokenized bitext with BBPE of the given vocabulary size. "
        "Defaults to None (disabled).",
    )
    parser.add_argument(
        "--byte-vocab",
        action="store_true",
        help="Generate tokenized bitext with a byte vocabulary.",
    )
    parser.add_argument(
        "--char-vocab",
        action="store_true",
        help="Generate tokenized bitext with a character vocabulary.",
    )
    args = parser.parse_args()

    preprocess_iwslt17(
        args.root,
        "fr",
        "en",
        args.bpe_vocab,
        args.char_vocab,
        args.bbpe_vocab,
        args.byte_vocab,
    )
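

# Example invocation (script name assumed; flags as defined in main() above):
#   python get_data.py --root data --bpe-vocab 16384 --bbpe-vocab 2048 --byte-vocab --char-vocab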


if __name__ == "__main__":
    main()