File size: 1,386 Bytes
5833fc0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/bin/bash
#
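# USAGE preprocess.sh langid spmodel < input > output
#
# NOTE: the code in this file only normalizes text from stdin to stdout;
# it does not read the langid/spmodel arguments or call SPMENCODE.
# Replace SPMENCODE with your own setup if you add an spm_encode step!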
#
# CHANGES
#
#  * issue with the perl code that removes control characters:
#    the unicode property Other (\p{C}) also matches newline
#    characters --> add a negative lookahead so that newline
#    characters are not removed!
#

SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`

## simple pre-processing steps adapted from Moses tools

sed -e 's/,/,/g' \
    -e 's/。 */. /g' \
    -e 's/、/,/g' \
    -e 's/”/"/g' \
    -e 's/“/"/g' \
    -e 's/∶/:/g' \
    -e 's/:/:/g' \
    -e 's/?/\?/g' \
    -e 's/《/"/g' \
    -e 's/》/"/g' \
    -e 's/)/\)/g' \
    -e 's/!/\!/g' \
    -e 's/(/\(/g' \
    -e 's/;/;/g' \
    -e 's/1/"/g' \
    -e 's/」/"/g' \
    -e 's/「/"/g' \
    -e 's/0/0/g' \
    -e 's/3/3/g' \
    -e 's/2/2/g' \
    -e 's/5/5/g' \
    -e 's/6/6/g' \
    -e 's/9/9/g' \
    -e 's/7/7/g' \
    -e 's/8/8/g' \
    -e 's/4/4/g' \
    -e 's/. */. /g' \
    -e 's/~/\~/g' \
    -e "s/’/\'/g" \
    -e 's/…/\.\.\./g' \
    -e 's/━/\-/g' \
    -e 's/〈/\</g' \
    -e 's/〉/\>/g' \
    -e 's/【/\[/g' \
    -e 's/】/\]/g' \
    -e 's/%/\%/g' |    
perl -C -pe  's/(?!\n)\p{C}/ /g;' |
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
sed 's/  */ /g;s/^ *//g;s/ *$//g'