#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Source languages; each is paired with the single target language TGT.
SRCS=(
    "de"
    "fr"
)
TGT=en

# All paths are resolved relative to the directory containing this script.
ROOT=$(dirname "$0")
SCRIPTS=$ROOT/../../scripts
SPM_TRAIN=$SCRIPTS/spm_train.py
SPM_ENCODE=$SCRIPTS/spm_encode.py

BPESIZE=16384
ORIG=$ROOT/iwslt17_orig             # downloaded archives and extracted data
DATA=$ROOT/iwslt17.de_fr.en.bpe16k  # pre-processed and BPE-encoded output
mkdir -p "$ORIG" "$DATA"

TRAIN_MINLEN=1  # remove sentences with <1 BPE token
TRAIN_MAXLEN=250  # remove sentences with >250 BPE tokens

# URLS, ARCHIVES and VALID_SETS are parallel arrays: URLS[i] is downloaded
# to ARCHIVES[i]; VALID_SETS[i] belongs to the language pair SRCS[i]-TGT.
URLS=(
    "https://wit3.fbk.eu/archive/2017-01-trnted/texts/de/en/de-en.tgz"
    "https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz"
)
ARCHIVES=(
    "de-en.tgz"
    "fr-en.tgz"
)
# Each entry is a whitespace-separated list of validation file stems; the
# entry is word-split into an array where it is used.
VALID_SETS=(
    "IWSLT17.TED.dev2010.de-en IWSLT17.TED.tst2010.de-en IWSLT17.TED.tst2011.de-en IWSLT17.TED.tst2012.de-en IWSLT17.TED.tst2013.de-en IWSLT17.TED.tst2014.de-en IWSLT17.TED.tst2015.de-en"
    "IWSLT17.TED.dev2010.fr-en IWSLT17.TED.tst2010.fr-en IWSLT17.TED.tst2011.fr-en IWSLT17.TED.tst2012.fr-en IWSLT17.TED.tst2013.fr-en IWSLT17.TED.tst2014.fr-en IWSLT17.TED.tst2015.fr-en"
)

# download and extract data
for ((i=0;i<${#URLS[@]};++i)); do
    ARCHIVE=$ORIG/${ARCHIVES[i]}
    if [ -f "$ARCHIVE" ]; then
        echo "$ARCHIVE already exists, skipping download"
    else
        URL=${URLS[i]}
        wget -P "$ORIG" "$URL"
        # Judge success by the presence of the archive file.
        if [ -f "$ARCHIVE" ]; then
            echo "$URL successfully downloaded."
        else
            echo "$URL not successfully downloaded."
            exit 1
        fi
    fi
    # Path the archive unpacks to: the archive path without its ".tgz"
    # suffix (e.g. $ORIG/de-en). The previous "${ARCHIVE: -4}" expanded to
    # the *last* four characters (".tgz"), so the existence check below
    # never looked at the extracted directory.
    FILE=${ARCHIVE%.tgz}
    if [ -e "$FILE" ]; then
        echo "$FILE already exists, skipping extraction"
    else
        tar -C "$ORIG" -xzvf "$ARCHIVE"
    fi
done

echo "pre-processing train data..."
# Clean the raw training files: drop metadata-only lines, unwrap the
# <title>/<description> elements and trim surrounding whitespace.
for SRC in "${SRCS[@]}"; do
    for LANG in "${SRC}" "${TGT}"; do
        # NOTE(review): the tag patterns had been reduced to empty strings
        # (grep -v '' discards every line, and sed 's///g' is an error).
        # They are restored here from the tags used in the IWSLT train.tags
        # files -- confirm the list against the downloaded data.
        grep -v \
            -e '<url>' \
            -e '<talkid>' \
            -e '<keywords>' \
            -e '<speaker>' \
            -e '<reviewer' \
            -e '<translator' \
            -e '<doc' \
            -e '</doc>' \
            "$ORIG/${SRC}-${TGT}/train.tags.${SRC}-${TGT}.${LANG}" \
        | sed \
            -e 's/<title>//g' \
            -e 's/<\/title>//g' \
            -e 's/<description>//g' \
            -e 's/<\/description>//g' \
            -e 's/^\s*//g' \
            -e 's/\s*$//g' \
        > "$DATA/train.${SRC}-${TGT}.${LANG}"
    done
done

echo "pre-processing valid data..."
# Extract the text of every <seg> element from each validation XML file.
# Output files are numbered by the set's position j within its language pair.
for ((i=0;i<${#SRCS[@]};++i)); do
    SRC=${SRCS[i]}
    # Intentional word-splitting: each VALID_SETS entry is a
    # whitespace-separated list of file stems.
    # shellcheck disable=SC2206
    VALID_SET=(${VALID_SETS[i]})
    for ((j=0;j<${#VALID_SET[@]};++j)); do
        FILE=${VALID_SET[j]}
        for LANG in "$SRC" "$TGT"; do
            grep '<seg id' "$ORIG/${SRC}-${TGT}/${FILE}.${LANG}.xml" \
            | sed \
                -e 's/<seg id="[0-9]*">\s*//g' \
                -e 's/\s*<\/seg>\s*//g' \
                -e "s/\’/\'/g" \
            > "$DATA/valid${j}.${SRC}-${TGT}.${LANG}"
        done
    done
done

# learn BPE with sentencepiece
# Collect every cleaned training file (source and target side of each pair)
# and join them with commas. Joining an array avoids the trailing comma, and
# hence the empty final entry, that piping through tr "\n" "," produced.
TRAIN_FILE_LIST=()
for SRC in "${SRCS[@]}"; do
    TRAIN_FILE_LIST+=(
        "$DATA/train.${SRC}-${TGT}.${SRC}"
        "$DATA/train.${SRC}-${TGT}.${TGT}"
    )
done
TRAIN_FILES=$(IFS=,; printf '%s' "${TRAIN_FILE_LIST[*]}")
echo "learning joint BPE over ${TRAIN_FILES}..."
python "$SPM_TRAIN" \
    --input="$TRAIN_FILES" \
    --model_prefix="$DATA/sentencepiece.bpe" \
    --vocab_size="$BPESIZE" \
    --character_coverage=1.0 \
    --model_type=bpe

# encode train/valid
echo "encoding train with learned BPE..."
# Both sides of a pair are passed to one call together with the
# TRAIN_MINLEN/TRAIN_MAXLEN bounds defined at the top of the script.
for SRC in "${SRCS[@]}"; do
    python "$SPM_ENCODE" \
        --model "$DATA/sentencepiece.bpe.model" \
        --output_format=piece \
        --inputs "$DATA/train.${SRC}-${TGT}.${SRC}" "$DATA/train.${SRC}-${TGT}.${TGT}" \
        --outputs "$DATA/train.bpe.${SRC}-${TGT}.${SRC}" "$DATA/train.bpe.${SRC}-${TGT}.${TGT}" \
        --min-len "$TRAIN_MINLEN" --max-len "$TRAIN_MAXLEN"
done

echo "encoding valid with learned BPE..."
# Encode every validation set of every language pair with the learned BPE
# model. No --min-len/--max-len is passed here, unlike for the training data.
for ((i=0;i<${#SRCS[@]};++i)); do
    SRC=${SRCS[i]}
    # Intentional word-splitting: each VALID_SETS entry is a
    # whitespace-separated list of file stems. Only the count is used here;
    # the files themselves are addressed by their index j.
    # shellcheck disable=SC2206
    VALID_SET=(${VALID_SETS[i]})
    for ((j=0;j<${#VALID_SET[@]};++j)); do
        # Paths are quoted so that a script directory containing whitespace
        # does not split them into several arguments.
        python "$SPM_ENCODE" \
            --model "$DATA/sentencepiece.bpe.model" \
            --output_format=piece \
            --inputs "$DATA/valid${j}.${SRC}-${TGT}.${SRC}" "$DATA/valid${j}.${SRC}-${TGT}.${TGT}" \
            --outputs "$DATA/valid${j}.bpe.${SRC}-${TGT}.${SRC}" "$DATA/valid${j}.bpe.${SRC}-${TGT}.${TGT}"
    done
done