#!/usr/bin/env python3
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import sys

import sacremoses


def main(args):
    """Tokenizes, preserving tabs.

    Reads text from stdin line by line, splits each line on tab characters,
    tokenizes every field independently with a sacremoses ``MosesTokenizer``
    and prints the tokenized fields joined by tabs again, so the number of
    columns in each line is unchanged.

    Args:
        args: Parsed command-line options. ``args.lang`` is passed to
            ``MosesTokenizer`` as the language. ``args.penn`` is optional;
            when present and true, ``MosesTokenizer.penn_tokenize`` is used
            instead of ``MosesTokenizer.tokenize``.
    """
    mt = sacremoses.MosesTokenizer(lang=args.lang)

    # The --penn switch used to be parsed and then silently ignored. Read it
    # with getattr so namespaces built by callers without that attribute keep
    # getting the default tokenizer.
    use_penn = getattr(args, "penn", False)
    tokenize = mt.penn_tokenize if use_penn else mt.tokenize

    def tok(s):
        # return_str=True yields one space-joined string instead of a list.
        return tokenize(s, return_str=True)

    for line in sys.stdin:
        # Drop the line terminator before splitting so it can never end up
        # inside the last field; print() appends exactly one newline.
        fields = line.rstrip("\n").split("\t")
        # flush=True writes each output line as soon as it is tokenized.
        print(*map(tok, fields), sep="\t", flush=True)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", "-l", default="en")
    parser.add_argument("--penn", "-p", action="store_true")
    # NOTE(review): --fields is accepted but main() does not consume it; every
    # tab-separated field is tokenized. Its expected format is not defined
    # here, so it is left unimplemented rather than guessed at.
    parser.add_argument("--fields", "-f", help="fields to tokenize")
    args = parser.parse_args()

    main(args)