OFA-OCR / fairseq /examples /multilingual /data_scripts /download_iwslt_and_extract.sh
JustinLin610's picture
first commit
ee21b96
raw
history blame
No virus
6.21 kB
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#echo 'Cloning Moses github repository (for tokenization scripts)...'
#git clone https://github.com/moses-smt/mosesdecoder.git
# The whole pipeline writes below WORKDIR_ROOT, so refuse to run without it.
# This is a configuration error: report it on stderr and exit non-zero so
# calling scripts can detect the failure.
if [[ -z "${WORKDIR_ROOT:-}" ]]; then
  echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." >&2
  exit 1
fi

# Directory layout:
#   data_root      - scratch area owned by this script
#   download_path  - downloaded .tgz archives
#   orig           - extracted archive contents
#   tmp            - dev/test text converted from XML
#   raw (DESTDIR)  - final train/valid/test files
data_root=${WORKDIR_ROOT}/iwsltv2
DESTDIR=${WORKDIR_ROOT}/ML50/raw
# Language ids handled by this script; the first two characters of each id
# are used as the short language code in IWSLT file names.
langs="ar_AR it_IT nl_XX ko_KR vi_VN"
echo "data_root: $data_root"
download_path=${data_root}/downloads
raw=${DESTDIR}
tmp=${data_root}/tmp
orig=${data_root}/orig
# Nothing below can work without these directories, so stop if they cannot
# be created.
mkdir -p -- "$download_path" "$orig" "$raw" "$tmp" || exit 1
#######################
#######################################
# Download one IWSLT archive into ${download_path} unless it is already there.
# Works on full paths instead of pushd/popd, so the caller's working
# directory is never changed, whatever the outcome of the download.
# Globals:   download_path (read)
# Arguments: $1 - archive key on wit3.fbk.eu (e.g. "2017-01-trnted")
#            $2 - source language code used in the URL and file name
#            $3 - target language code used in the URL and file name
#            $4 - optional prefix for the saved file name (e.g. "test.")
# Returns:   0 if the archive is present afterwards, wget's status otherwise
#######################################
download_iwslt() {
  local iwslt_key=$1
  local src=$2
  local tgt=$3
  local save_prefix=${4:-}
  local archive="${download_path}/${save_prefix}${src}-${tgt}.tgz"
  local url="https://wit3.fbk.eu/archive/${iwslt_key}/texts/${src}/${tgt}/${src}-${tgt}.tgz"
  local status=0

  # Already fetched on an earlier run: nothing to do.
  if [[ -f "$archive" ]]; then
    return 0
  fi

  wget "$url" -O "$archive" || status=$?
  if (( status != 0 )); then
    # wget -O can leave an empty or partial output file behind; remove it so
    # the "already downloaded" check above does not accept it on the next run.
    rm -f -- "$archive"
    echo "failed to download $url" >&2
  fi
  return "$status"
}
#######################################
# Unpack a previously downloaded archive into ${orig}.
# Uses tar -C rather than pushd/popd, so the working directory is left alone
# and tar's exit status is what the caller sees.
# Globals:   download_path (read), orig (read)
# Arguments: $1 - source language code in the archive name
#            $2 - target language code in the archive name
#            $3 - optional file-name prefix the archive was saved with
# Outputs:   tar's verbose member listing on stdout
# Returns:   exit status of tar
#######################################
extract_iwslt() {
  local src=$1
  local tgt=$2
  local prefix=${3:-}
  local archive="${download_path}/${prefix}${src}-${tgt}.tgz"

  tar -xzvf "$archive" -C "$orig"
}
#######################################
# Build the plain-text training files for one translation direction.
# For each side of the pair, concatenates every
# ${orig}/*/train.tags.<src>-<tgt>.<lang> file, drops the metadata lines
# (url, talkid, keywords, speaker, reviewer, translator, doc), removes the
# <title>/<description> tags while keeping their text, trims surrounding
# whitespace and writes the result to ${raw}/train.<lsrc>-<ltgt>.<id>.
# A side with no matching input is skipped with a warning instead of
# producing an empty output file.
# Globals:   orig (read), raw (read)
# Arguments: $1 - source language id (e.g. ar_AR); its first two characters
#                 are the code used in the IWSLT file names
#            $2 - target language id (e.g. en_XX)
# Returns:   0 always (best effort per side)
#######################################
generate_train() {
  local lsrc=$1
  local ltgt=$2
  local src=${lsrc:0:2}
  local tgt=${ltgt:0:2}
  local ll l f_raw
  local -a inputs

  for ll in "$lsrc" "$ltgt"; do
    l=${ll:0:2}
    f_raw="$raw/train.$lsrc-$ltgt.$ll"

    # Expand the glob into an array; when nothing matches, the pattern stays
    # literal and fails the existence check below.
    inputs=("$orig"/*/train.tags."$src-$tgt.$l")
    if [[ ! -e "${inputs[0]}" ]]; then
      echo "no training file matches $orig/*/train.tags.$src-$tgt.$l, skipping" >&2
      continue
    fi

    if cat -- "${inputs[@]}" \
      | grep -v \
        -e '<url>' \
        -e '<talkid>' \
        -e '<keywords>' \
        -e '<speaker>' \
        -e '<reviewer' \
        -e '<translator' \
        -e '<doc' \
        -e '</doc>' \
      | sed \
        -e 's/<title>//g' \
        -e 's/<\/title>//g' \
        -e 's/<description>//g' \
        -e 's/<\/description>//g' \
        -e 's/^[[:space:]]*//' \
        -e 's/[[:space:]]*$//' \
      > "$f_raw"; then
      echo "extracted ${inputs[*]} to $f_raw"
    fi
  done
  return 0
}
#######################################
# Convert the IWSLT dev/test XML files of one direction into plain text.
# For each side of the pair, every
# ${orig}/*/IWSLT*.TED*.<src>-<tgt>.<lang>.xml file is reduced to the text of
# its <seg id="N"> lines (tags and surrounding whitespace removed, the
# typographic apostrophe replaced by an ASCII one) and written to ${tmp}
# under the same name without the .xml extension.
# Globals:   orig (read), tmp (read)
# Arguments: $1 - source language code (e.g. ar)
#            $2 - target language code (e.g. en)
# Outputs:   progress lines on stdout
#######################################
convert_valid_test() {
  local src=$1
  local tgt=$2
  local l o fname f

  for l in "$src" "$tgt"; do
    echo "lang: ${l}"
    # Glob directly rather than parsing ls output, so paths containing
    # whitespace survive intact.
    for o in "$orig"/*/IWSLT*.TED*."$src-$tgt.$l".xml; do
      # An unmatched glob stays literal; skip it.
      [[ -f "$o" ]] || continue
      fname=${o##*/}
      f="$tmp/${fname%.*}"
      echo "$o => $f"
      grep '<seg id' "$o" \
        | sed \
          -e 's/<seg id="[0-9]*">[[:space:]]*//g' \
          -e 's/[[:space:]]*<\/seg>[[:space:]]*//g' \
          -e "s/’/'/g" \
        > "$f"
      echo ""
    done
  done
}
#######################################
# Copy one converted dev/test set from ${tmp} into ${raw} under its final
# name, for both sides of a translation direction. A side whose converted
# file does not exist is silently skipped.
# Globals:   tmp (read), raw (read)
# Arguments: $1 - source language id (e.g. en_XX); its first two characters
#                 are the code used in the converted file names
#            $2 - target language id (e.g. vi_VN)
#            $3 - subset name used in the output file name (e.g. valid, test)
#            $4 - converted file-name prefix (e.g. IWSLT15.TED.tst2013)
#######################################
generate_subset() {
  local lsrc=$1
  local ltgt=$2
  local subset=$3
  local prefix=$4
  local src=${lsrc:0:2}
  local tgt=${ltgt:0:2}
  local ll l f

  for ll in "$lsrc" "$ltgt"; do
    l=${ll:0:2}
    f="$tmp/$prefix.${src}-${tgt}.$l"
    if [[ -f "$f" ]]; then
      cp -- "$f" "$raw/$subset.${lsrc}-$ltgt.${ll}"
    fi
  done
}
#################
# Fetch every archive the pipeline needs: training/dev bundles first, then
# the test bundles, each language pair in both directions.
echo "downloading iwslt training and dev data"
# it and nl are taken from the multilingual bundle
download_iwslt "2017-01-trnmted" DeEnItNlRo DeEnItNlRo
for dl_code in ar ko; do
  download_iwslt "2017-01-trnted" "$dl_code" en
  download_iwslt "2017-01-trnted" en "$dl_code"
done
download_iwslt "2015-01" vi en
download_iwslt "2015-01" en vi

echo "donwloading iwslt test data"
# Each entry is "<archive key>:<language code>"; test archives are saved
# with the "test." file-name prefix.
for dl_spec in \
  "2017-01-mted-test:it" \
  "2017-01-mted-test:nl" \
  "2017-01-ted-test:ar" \
  "2017-01-ted-test:ko" \
  "2015-01-test:vi"; do
  dl_key=${dl_spec%%:*}
  dl_code=${dl_spec#*:}
  download_iwslt "$dl_key" "$dl_code" en "test."
  download_iwslt "$dl_key" en "$dl_code" "test."
done
# Unpack the training archives: the multilingual bundle first, then each
# bilingual pair in both directions.
echo "extract training data tar balls"
extract_iwslt DeEnItNlRo DeEnItNlRo
for pair_code in ar ko vi; do
  extract_iwslt "$pair_code" en
  extract_iwslt en "$pair_code"
done

# Test archives were saved with the "test." prefix; the two-letter code is
# the start of each language id in $langs.
echo "extracting iwslt test data"
for lang in $langs; do
  extract_iwslt "${lang:0:2}" en "test."
  extract_iwslt en "${lang:0:2}" "test."
done

# Turn the dev/test XML files into plain text, both directions per language.
echo "convert dev and test data"
for lang in $langs; do
  convert_valid_test "${lang:0:2}" en
  convert_valid_test en "${lang:0:2}"
done

# Training files are named with the full language ids.
echo "creating training data into $raw"
for lang in $langs; do
  generate_train "$lang" en_XX
  generate_train en_XX "$lang"
done
# Pick which converted set becomes the valid split for each language.
# Each entry is "<language id>:<converted file-name prefix>"; both
# directions are requested for every entry.
echo "creating iwslt dev data into raw"
for eval_spec in \
  "vi_VN:IWSLT15.TED.tst2013" \
  "ar_AR:IWSLT17.TED.tst2016" \
  "ko_KR:IWSLT17.TED.tst2016" \
  "it_IT:IWSLT17.TED.tst2010" \
  "nl_XX:IWSLT17.TED.tst2010"; do
  eval_lang=${eval_spec%%:*}
  eval_set=${eval_spec#*:}
  generate_subset en_XX "$eval_lang" valid "$eval_set"
  generate_subset "$eval_lang" en_XX valid "$eval_set"
done

# Same again for the test split.
echo "creating iswslt test data into raw"
for eval_spec in \
  "vi_VN:IWSLT15.TED.tst2015" \
  "ar_AR:IWSLT17.TED.tst2017" \
  "ko_KR:IWSLT17.TED.tst2017" \
  "it_IT:IWSLT17.TED.tst2017.mltlng" \
  "nl_XX:IWSLT17.TED.tst2017.mltlng"; do
  eval_lang=${eval_spec%%:*}
  eval_set=${eval_spec#*:}
  generate_subset en_XX "$eval_lang" test "$eval_set"
  generate_subset "$eval_lang" en_XX test "$eval_set"
done
#######################################
# Normalize IWSLT directions into x-en: for every language in $langs and for
# the test and valid splits, make sure both sides of the <lang>-en_XX pair
# exist in ${raw} by copying them from the en_XX-<lang> pair when only that
# one was produced. Existing <lang>-en_XX files are never overwritten.
# Globals:   raw (read), langs (read)
# Outputs:   one "cp <from> <to>" line on stdout per file copied
#######################################
normalize_directions_to_x_en() {
  local lang split side en_x_file x_en_file

  for lang in $langs; do
    for split in test valid; do
      # Both sides of the pair get the same treatment: the English file and
      # the file in the other language.
      for side in en_XX "$lang"; do
        en_x_file="$split.en_XX-$lang.$side"
        x_en_file="$split.$lang-en_XX.$side"
        if [[ -f "$raw/$en_x_file" && ! -f "$raw/$x_en_file" ]]; then
          echo "cp $en_x_file $x_en_file"
          cp -- "$raw/$en_x_file" "$raw/$x_en_file"
        fi
      done
    done
  done
}
normalize_directions_to_x_en