#!/usr/bin/env bash
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
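
# Takes a language code as the first argument; reads raw text on stdin
# and writes tokenized text on stdout.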

# Exit on any error, including failures earlier in a pipeline.
set -eo pipefail

TOKENIZERS_SCRIPTS=tokenizers
INSTALL_PATH=$TOKENIZERS_SCRIPTS/thirdparty

N_THREADS=8  # threads passed to the Moses tokenizer

lg=$1  # language code of the input text (e.g. en, zh, ro)
if [ -z "$lg" ]; then
  echo "Usage: $0 <lang> < input.txt > output.tok" >&2
  exit 1
fi

MOSES=$INSTALL_PATH/mosesdecoder
REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl
NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl
TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl

# special tokenization for Romanian
WMT16_SCRIPTS=$INSTALL_PATH/wmt16-scripts

NORMALIZE_ROMANIAN=$WMT16_SCRIPTS/preprocess/normalise-romanian.py
REMOVE_DIACRITICS=$WMT16_SCRIPTS/preprocess/remove-diacritics.py

# Burmese
MY_SEGMENT=$INSTALL_PATH/seg_my.py

# Arabic
AR_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenizer_ar.sh

# Korean
KO_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ko.sh

# Japanese
JA_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ja.sh

# Indic
IN_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_indic.py
INDIC_RESOURCES_PATH=$INSTALL_PATH/indic_nlp_resources

# Thai
THAI_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_thai.py

# Chinese
CHINESE_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_zh.py
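
# Dispatch on the language code; languages without a dedicated
# segmenter fall through to the default Moses pipeline at the end.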

# Chinese
if [ "$lg" = "zh" ]; then
  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | python $CHINESE_TOKENIZER
# Thai
elif [ "$lg" = "th" ]; then
  cat - | python $THAI_TOKENIZER
# Japanese
elif [ "$lg" = "ja" ]; then
  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | ${JA_SEGMENT}
# Korean
elif [ "$lg" = "ko" ]; then
  cat - | $REM_NON_PRINT_CHAR | ${KO_SEGMENT}
# Romanian
elif [ "$lg" = "ro" ]; then
  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | $NORMALIZE_ROMANIAN | $REMOVE_DIACRITICS | $TOKENIZER -no-escape -threads $N_THREADS -l $lg
# Burmese
elif [ "$lg" = "my" ]; then
  cat - | python ${MY_SEGMENT}
# Arabic
elif [ "$lg" = "ar" ]; then
  cat - | ${AR_TOKENIZER}
# Indic (Nepali, Sinhala, Hindi)
elif [ "$lg" = "ne" ] || [ "$lg" = "si" ] || [ "$lg" = "hi" ]; then
  cat - | python ${IN_TOKENIZER} $lg
# other languages
else
  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | $TOKENIZER -no-escape -threads $N_THREADS -l $lg
fi