#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Downloads the IWSLT archives from wit3.fbk.eu, unpacks them, and writes
# plain-text train/valid/test files for ar, it, nl, ko and vi (paired with
# en) into ${WORKDIR_ROOT}/ML50/raw.
#
# Required environment:
#   WORKDIR_ROOT - working directory root; intermediate files are kept under
#                  ${WORKDIR_ROOT}/iwsltv2.

#echo 'Cloning Moses github repository (for tokenization scripts)...'
#git clone https://github.com/moses-smt/mosesdecoder.git

if [[ -z "${WORKDIR_ROOT:-}" ]]; then
  echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." >&2
  # A missing configuration is an error, so report it with a non-zero status.
  exit 1
fi

data_root=${WORKDIR_ROOT}/iwsltv2
DESTDIR=${WORKDIR_ROOT}/ML50/raw

langs=(ar_AR it_IT nl_XX ko_KR vi_VN)
echo "data_root: $data_root"

download_path=${data_root}/downloads
raw=${DESTDIR}
tmp=${data_root}/tmp
orig=${data_root}/orig

mkdir -p "$download_path" "$orig" "$raw" "$tmp"

#######################################
# Download one IWSLT archive unless it is already present.
# Globals:   download_path (read)
# Arguments: $1 - archive key on wit3.fbk.eu (e.g. "2017-01-trnted")
#            $2 - source language code used in the URL
#            $3 - target language code used in the URL
#            $4 - optional prefix for the saved file name (e.g. "test.")
# Returns:   0 if the archive exists or was downloaded, 1 if wget failed
#######################################
download_iwslt() {
  local iwslt_key=$1
  local src=$2
  local tgt=$3
  local save_prefix=${4:-}
  local archive="${download_path}/${save_prefix}${src}-${tgt}.tgz"
  local url="https://wit3.fbk.eu/archive/${iwslt_key}/texts/${src}/${tgt}/${src}-${tgt}.tgz"

  if [[ -f "$archive" ]]; then
    return 0
  fi

  # Write straight to the destination path instead of changing directory, so
  # the working directory is never left modified on any return path.
  if ! wget "$url" -O "$archive"; then
    # wget -O creates the output file even when the transfer fails; remove it
    # so that a later run retries instead of treating it as downloaded.
    rm -f -- "$archive"
    return 1
  fi
  return 0
}

#######################################
# Unpack a previously downloaded archive into $orig.
# Globals:   download_path, orig (read)
# Arguments: $1 - source language code
#            $2 - target language code
#            $3 - optional file-name prefix used when the archive was saved
# Returns:   exit status of tar
#######################################
extract_iwslt() {
  local src=$1
  local tgt=$2
  local prefix=${3:-}

  tar -zxvf "${download_path}/${prefix}${src}-${tgt}.tgz" -C "$orig"
}

#######################################
# Build the raw training files for one translation direction from the
# train.tags.* files found under $orig.
# Globals:   orig, raw (read)
# Arguments: $1 - source language with region suffix (e.g. "ar_AR")
#            $2 - target language with region suffix (e.g. "en_XX")
# Outputs:   writes $raw/train.<lsrc>-<ltgt>.<lang> for both languages
# Returns:   0 always
#######################################
generate_train() {
  local lsrc=$1
  local ltgt=$2
  # The extracted files are named with the two-letter language code only.
  local src=${lsrc:0:2}
  local tgt=${ltgt:0:2}
  local ll l f f_raw

  for ll in "$lsrc" "$ltgt"; do
    l=${ll:0:2}
    # Kept as an unexpanded pattern for the log message below.
    f="$orig/*/train.tags.$src-$tgt.$l"
    f_raw="$raw/train.$lsrc-$ltgt.$ll"
    # Drop metadata lines, unwrap <title>/<description>, then trim
    # leading and trailing whitespace.
    # NOTE(review): the list of metadata tags below was reconstructed (the
    # patterns were empty, which discarded every line) - confirm against the
    # actual train.tags.* files.
    if cat "$orig"/*/"train.tags.$src-$tgt.$l" \
      | grep -v \
        -e '<url>' \
        -e '<talkid>' \
        -e '<keywords>' \
        -e '<speaker>' \
        -e '<reviewer' \
        -e '<translator' \
        -e '<doc' \
        -e '</doc>' \
      | sed \
        -e 's/<title>//g' \
        -e 's/<\/title>//g' \
        -e 's/<description>//g' \
        -e 's/<\/description>//g' \
        -e 's/^\s*//g' \
        -e 's/\s*$//g' \
        > "$f_raw"; then
      echo "extracted $f to $f_raw"
    fi
  done
  return 0
}

#######################################
# Convert the dev/test XML files of one direction into plain text in $tmp,
# keeping only the text inside <seg id="..."> elements.
# Globals:   orig, tmp (read)
# Arguments: $1 - two-letter source language code
#            $2 - two-letter target language code
# Outputs:   one text file per XML file, named after it without ".xml"
#######################################
convert_valid_test() {
  local src=$1
  local tgt=$2
  local l o fname f

  for l in "$src" "$tgt"; do
    echo "lang: ${l}"
    for o in "$orig"/*/IWSLT*.TED*."$src-$tgt.$l".xml; do
      # An unmatched glob stays literal; skip it rather than process nothing.
      [[ -e "$o" ]] || continue
      fname=${o##*/}
      f="$tmp/${fname%.*}"
      echo "$o => $f"
      grep '<seg id' "$o" \
        | sed \
          -e 's/<seg id="[0-9]*">\s*//g' \
          -e 's/\s*<\/seg>\s*//g' \
          -e "s/\’/\'/g" \
        > "$f"
      echo ""
    done
  done
}

#######################################
# Copy one converted dev/test set from $tmp into $raw under its split name.
# Files that do not exist in $tmp are silently skipped.
# Globals:   tmp, raw (read)
# Arguments: $1 - source language with region suffix
#            $2 - target language with region suffix
#            $3 - split name ("valid" or "test")
#            $4 - file-name prefix of the converted set (e.g. "IWSLT17.TED.tst2016")
#######################################
generate_subset() {
  local lsrc=$1
  local ltgt=$2
  local src=${lsrc:0:2}
  local tgt=${ltgt:0:2}
  local subset=$3
  local prefix=$4
  local ll l f

  for ll in "$lsrc" "$ltgt"; do
    l=${ll:0:2}
    f="$tmp/$prefix.${src}-${tgt}.$l"
    if [[ -f "$f" ]]; then
      cp "$f" "$raw/$subset.${lsrc}-$ltgt.${ll}"
    fi
  done
}

#################

echo "downloading iwslt training and dev data"
# using multilingual for it, nl
download_iwslt "2017-01-trnmted" DeEnItNlRo DeEnItNlRo
download_iwslt "2017-01-trnted" ar en
download_iwslt "2017-01-trnted" en ar
download_iwslt "2017-01-trnted" ko en
download_iwslt "2017-01-trnted" en ko
download_iwslt "2015-01" vi en
download_iwslt "2015-01" en vi

echo "donwloading iwslt test data"
download_iwslt "2017-01-mted-test" it en "test."
download_iwslt "2017-01-mted-test" en it "test."
download_iwslt "2017-01-mted-test" nl en "test."
download_iwslt "2017-01-mted-test" en nl "test."

download_iwslt "2017-01-ted-test" ar en "test."
download_iwslt "2017-01-ted-test" en ar "test."
download_iwslt "2017-01-ted-test" ko en "test."
download_iwslt "2017-01-ted-test" en ko "test."
download_iwslt "2015-01-test" vi en "test."
download_iwslt "2015-01-test" en vi "test."

echo "extract training data tar balls"
extract_iwslt DeEnItNlRo DeEnItNlRo
extract_iwslt ar en
extract_iwslt en ar
extract_iwslt ko en
extract_iwslt en ko
extract_iwslt vi en
extract_iwslt en vi

echo "extracting iwslt test data"
for lang in "${langs[@]}"; do
  l=${lang:0:2}
  extract_iwslt "$l" en "test."
  extract_iwslt en "$l" "test."
done

echo "convert dev and test data"
for lang in "${langs[@]}"; do
  s_lang=${lang:0:2}
  convert_valid_test "$s_lang" en
  convert_valid_test en "$s_lang"
done

echo "creating training data into $raw"
for lang in "${langs[@]}"; do
  generate_train "$lang" en_XX
  generate_train en_XX "$lang"
done

echo "creating iwslt dev data into raw"
generate_subset en_XX vi_VN valid "IWSLT15.TED.tst2013"
generate_subset vi_VN en_XX valid "IWSLT15.TED.tst2013"

generate_subset en_XX ar_AR valid "IWSLT17.TED.tst2016"
generate_subset ar_AR en_XX valid "IWSLT17.TED.tst2016"
generate_subset en_XX ko_KR valid "IWSLT17.TED.tst2016"
generate_subset ko_KR en_XX valid "IWSLT17.TED.tst2016"

generate_subset en_XX it_IT valid "IWSLT17.TED.tst2010"
generate_subset it_IT en_XX valid "IWSLT17.TED.tst2010"
generate_subset en_XX nl_XX valid "IWSLT17.TED.tst2010"
generate_subset nl_XX en_XX valid "IWSLT17.TED.tst2010"

echo "creating iswslt test data into raw"
generate_subset en_XX vi_VN test "IWSLT15.TED.tst2015"
generate_subset vi_VN en_XX test "IWSLT15.TED.tst2015"

generate_subset en_XX ar_AR test "IWSLT17.TED.tst2017"
generate_subset ar_AR en_XX test "IWSLT17.TED.tst2017"
generate_subset en_XX ko_KR test "IWSLT17.TED.tst2017"
generate_subset ko_KR en_XX test "IWSLT17.TED.tst2017"

generate_subset en_XX it_IT test "IWSLT17.TED.tst2017.mltlng"
generate_subset it_IT en_XX test "IWSLT17.TED.tst2017.mltlng"
generate_subset en_XX nl_XX test "IWSLT17.TED.tst2017.mltlng"
generate_subset nl_XX en_XX test "IWSLT17.TED.tst2017.mltlng"

# normalize iwslt directions into x-en
pushd "$raw" || exit 1
for lang in "${langs[@]}"; do
  for split in test valid; do
    x_en_f1=$split.$lang-en_XX.en_XX
    x_en_f2=$split.$lang-en_XX.${lang}

    en_x_f1=$split.en_XX-$lang.en_XX
    en_x_f2=$split.en_XX-$lang.${lang}

    if [[ -f "$en_x_f1" && ! -f "$x_en_f1" ]]; then
      echo "cp $en_x_f1 $x_en_f1"
      cp "$en_x_f1" "$x_en_f1"
    fi
    # Mirror the non-English side as well; the source of the copy is the
    # en-x file, so that is the one whose existence must be checked.
    if [[ -f "$en_x_f2" && ! -f "$x_en_f2" ]]; then
      echo "cp $en_x_f2 $x_en_f2"
      cp "$en_x_f2" "$x_en_f2"
    fi
  done
done
popd