#!/bin/bash # # USAGE preprocess.sh langid spmodel < input > output # # replace SPMENCODE with your own setup! # # CHANGES # # * issue with perl code that removes control characters # unicode property Other = \p{C}) seems to remove # newline characters as well --> add negative lookahead # to avoid removing newline characters! # SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"` ## simple pre-processing steps adapted from Moses tools sed -e 's/,/,/g' \ -e 's/。 */. /g' \ -e 's/、/,/g' \ -e 's/”/"/g' \ -e 's/“/"/g' \ -e 's/∶/:/g' \ -e 's/:/:/g' \ -e 's/?/\?/g' \ -e 's/《/"/g' \ -e 's/》/"/g' \ -e 's/)/\)/g' \ -e 's/!/\!/g' \ -e 's/(/\(/g' \ -e 's/;/;/g' \ -e 's/1/"/g' \ -e 's/」/"/g' \ -e 's/「/"/g' \ -e 's/0/0/g' \ -e 's/3/3/g' \ -e 's/2/2/g' \ -e 's/5/5/g' \ -e 's/6/6/g' \ -e 's/9/9/g' \ -e 's/7/7/g' \ -e 's/8/8/g' \ -e 's/4/4/g' \ -e 's/. */. /g' \ -e 's/~/\~/g' \ -e "s/’/\'/g" \ -e 's/…/\.\.\./g' \ -e 's/━/\-/g' \ -e 's/〈/\/g' \ -e 's/【/\[/g' \ -e 's/】/\]/g' \ -e 's/%/\%/g' | perl -C -pe 's/(?!\n)\p{C}/ /g;' | perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\ sed 's/ */ /g;s/^ *//g;s/ *$//g'