#!/usr/bin/env bash
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Tokenize text for a single language.
#
# Usage:   <this script> LANG < input > output
# Args:    $1 - language code; selects which tokenizer pipeline is run.
# Input:   raw text on stdin.
# Output:  whatever the selected pipeline writes to stdout.
# Returns: non-zero if any stage of the selected pipeline fails.
#
# Every helper path below is relative (rooted at "tokenizers"), so the script
# has to be started from the directory that contains that folder.

# pipefail: without it only the last stage's status is seen, so a broken or
# missing earlier stage would be reported as success.
set -euo pipefail

readonly TOKENIZERS_SCRIPTS=tokenizers
readonly INSTALL_PATH=$TOKENIZERS_SCRIPTS/thirdparty

readonly N_THREADS=8

# ${1:-} keeps a call without arguments alive under `set -u`; an empty or
# unrecognised code is handled by the generic branch at the bottom.
lg=${1:-}

readonly MOSES=$INSTALL_PATH/mosesdecoder
readonly REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl
readonly NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl
readonly REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl
readonly TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl

# special tokenization for Romanian
readonly WMT16_SCRIPTS=$INSTALL_PATH/wmt16-scripts
readonly NORMALIZE_ROMANIAN=$WMT16_SCRIPTS/preprocess/normalise-romanian.py
readonly REMOVE_DIACRITICS=$WMT16_SCRIPTS/preprocess/remove-diacritics.py

# Burmese
readonly MY_SEGMENT=$INSTALL_PATH/seg_my.py

# Arabic
readonly AR_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenizer_ar.sh

# Korean
readonly KO_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ko.sh

# Japanese
readonly JA_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ja.sh

# Indic
readonly IN_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_indic.py
# Not referenced (or exported) anywhere below; kept as it was.
# shellcheck disable=SC2034
readonly INDIC_RESOURCES_PATH=$INSTALL_PATH/indic_nlp_resources

# Thai
readonly THAI_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_thai.py

# Chinese
readonly CHINESE_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_zh.py

# The first stage of every pipeline inherits the script's stdin directly.
case "$lg" in
  # Chinese
  zh)
    "$REPLACE_UNICODE_PUNCT" \
      | "$NORM_PUNC" -l "$lg" \
      | "$REM_NON_PRINT_CHAR" \
      | python "$CHINESE_TOKENIZER"
    ;;
  # Thai
  th)
    python "$THAI_TOKENIZER"
    ;;
  # Japanese
  ja)
    "$REPLACE_UNICODE_PUNCT" \
      | "$NORM_PUNC" -l "$lg" \
      | "$REM_NON_PRINT_CHAR" \
      | "$JA_SEGMENT"
    ;;
  # Korean
  ko)
    "$REM_NON_PRINT_CHAR" | "$KO_SEGMENT"
    ;;
  # Romanian
  ro)
    "$REPLACE_UNICODE_PUNCT" \
      | "$NORM_PUNC" -l "$lg" \
      | "$REM_NON_PRINT_CHAR" \
      | "$NORMALIZE_ROMANIAN" \
      | "$REMOVE_DIACRITICS" \
      | "$TOKENIZER" -no-escape -threads "$N_THREADS" -l "$lg"
    ;;
  # Burmese
  my)
    python "$MY_SEGMENT"
    ;;
  # Arabic
  ar)
    "$AR_TOKENIZER"
    ;;
  # Indic
  ne | si | hi)
    python "$IN_TOKENIZER" "$lg"
    ;;
  # other languages
  *)
    "$REPLACE_UNICODE_PUNCT" \
      | "$NORM_PUNC" -l "$lg" \
      | "$REM_NON_PRINT_CHAR" \
      | "$TOKENIZER" -no-escape -threads "$N_THREADS" -l "$lg"
    ;;
esac