|
#!/bin/bash |
|
|
|
if [ $# -ne 5 ]; then |
|
echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]" |
|
exit |
|
fi |
|
|
|
|
|
DATASET=$1 |
|
LANGPAIR=$2 |
|
DATABIN=$3 |
|
BPECODE=$4 |
|
MODEL=$5 |
|
|
|
SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1) |
|
TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2) |
|
|
|
|
|
BPEROOT=examples/backtranslation/subword-nmt/subword_nmt |
|
if [ ! -e $BPEROOT ]; then |
|
BPEROOT=subword-nmt/subword_nmt |
|
if [ ! -e $BPEROOT ]; then |
|
echo 'Cloning Subword NMT repository (for BPE pre-processing)...' |
|
git clone https://github.com/rsennrich/subword-nmt.git |
|
fi |
|
fi |
|
|
|
|
|
sacrebleu -t $DATASET -l $LANGPAIR --echo src \ |
|
| sacremoses tokenize -a -l $SRCLANG -q \ |
|
| python $BPEROOT/apply_bpe.py -c $BPECODE \ |
|
| fairseq-interactive $DATABIN --path $MODEL \ |
|
-s $SRCLANG -t $TGTLANG \ |
|
--beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \ |
|
| grep ^H- | cut -f 3- \ |
|
| sacremoses detokenize -l $TGTLANG -q \ |
|
| sacrebleu -t $DATASET -l $LANGPAIR |
|
|