Spaces:
Running
Running
#!/usr/bin/env python3 | |
# | |
# Copyright (c) Facebook, Inc. and its affiliates. | |
# | |
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
import sys | |
import sacremoses | |
def main(args):
    """Tokenize standard input with the Moses tokenizer, preserving tabs.

    Each line read from ``sys.stdin`` is split on tab characters, every
    resulting field is tokenized on its own, and the tokenized fields are
    printed to standard output joined by tabs again, one output line per
    input line.

    Args:
        args: Parsed command-line namespace. ``args.lang`` is passed to
            ``sacremoses.MosesTokenizer`` as ``lang``. ``args.penn`` is
            optional (treated as false when absent); when true, fields are
            tokenized with ``penn_tokenize`` instead of ``tokenize``.

    Note:
        ``args.fields`` is not consulted here: every field of every line
        is tokenized.
    """
    mt = sacremoses.MosesTokenizer(lang=args.lang)

    # Honor the --penn switch, which used to be parsed but never read.
    # getattr keeps callers that pass a namespace without ``penn`` working.
    tokenize = mt.penn_tokenize if getattr(args, "penn", False) else mt.tokenize

    def tok(s):
        return tokenize(s, return_str=True)

    for line in sys.stdin:
        # Strip the line terminator before splitting so the last field does
        # not carry it into the tokenizer; print() writes the newline back.
        fields = line.rstrip("\n").split("\t")
        # flush=True pushes each tokenized line out as soon as it is ready.
        print(*map(tok, fields), sep="\t", flush=True)
if __name__ == "__main__":
    import argparse

    # Command-line options as (flag spellings, add_argument keywords), in the
    # order they are registered with the parser.
    option_specs = (
        (("--lang", "-l"), {"default": "en"}),
        (("--penn", "-p"), {"action": "store_true"}),
        (("--fields", "-f"), {"help": "fields to tokenize"}),
    )

    cli = argparse.ArgumentParser()
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)

    # Hand the parsed namespace straight to the entry point.
    main(cli.parse_args())