#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Downloads the OPUS xh-en and af-en TMX corpora, converts them to plain
# bitext files, shuffles them with a fixed seed, splits them into train/valid
# sets, and copies the result to ${WORKDIR_ROOT}/ML50/raw.
#
# Required environment:
#   WORKDIR_ROOT  root of the working directory (intermediate and output files)
# Required tools:
#   wget, gzip, openssl, shuf, python, and tmx2corpus.py located in
#   $PWD/utils/tmx2corpus (it is not fetched by the code in this script).

set -x -e

if [ -z "${WORKDIR_ROOT:-}" ] ;
then
    echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." >&2
    exit 1
fi

# put intermediate files
TMP_DIR=$WORKDIR_ROOT/temp/af_xhv2
# output {train,valid,test} files to dest
DEST=${WORKDIR_ROOT}/ML50/raw

ROOT=${WORKDIR_ROOT}
UTILS=$PWD/utils
TMX2CORPUS="${UTILS}/tmx2corpus"
# Kept as a plain string ("python <script>"); it is expanded unquoted on
# purpose so that it splits into the interpreter and the script path.
TMX_TOOL="python ${TMX2CORPUS}/tmx2corpus.py"

mkdir -p "$TMP_DIR"
mkdir -p "$DEST"
mkdir -p "$UTILS"

#######################################
# Download one OPUS subset as TMX and convert it to two bitext files.
# Works inside ./extract_<subset>.<src>-<tgt> and leaves the results in the
# current directory as <subset>.<src>-<tgt>.<src> and <subset>.<src>-<tgt>.<tgt>.
# Globals:   TMX_TOOL (read)
# Arguments: $1 - source language code
#            $2 - target language code
#            $3 - subset name (used only to build file names)
#            $4 - URL of the gzipped TMX file
#######################################
function download_opus(){
    local src=$1
    local tgt=$2
    local subset=$3
    local url=$4
    local base="$subset.$src-$tgt"

    # -p: do not abort (set -e) when the directory is left from a previous run.
    mkdir -p "extract_$base"
    pushd "extract_$base"
    # Skip only when both converted outputs already exist. The archive itself
    # is not a usable marker: gzip -d removes it on success, and a failed
    # download can leave a truncated one behind.
    if [ ! -f "../$base.$src" ] || [ ! -f "../$base.$tgt" ]; then
        wget "$url" -O "$base.tmx.gz"
        # -f: overwrite a stale .tmx left over from an interrupted run.
        gzip -d -f "$base.tmx.gz"
        # The tool is expected to write bitext.<src> and bitext.<tgt> into the
        # current directory (those are the files moved below).
        $TMX_TOOL "$base.tmx"
        mv "bitext.$src" "../$base.$src"
        mv "bitext.$tgt" "../$base.$tgt"
    fi
    popd
}

#######################################
# Concatenate the bitext of several subsets into raw_train.<src>-<tgt>.{src,tgt}.
# Arguments: $1  - source language code
#            $2  - target language code
#            $3… - subset names; either one space-separated string or one
#                  name per argument (both forms are accepted)
#######################################
function concat_subsets(){
    local src=$1
    local tgt=$2
    # Join every remaining argument so that a caller passing "${!arr[@]}"
    # (many words) does not silently lose all subsets but the first.
    local subsets="${*:3}"
    local src_train="raw_train.$src-$tgt.$src"
    local tgt_train="raw_train.$src-$tgt.$tgt"
    local subset

    # Truncate (or create) the outputs before appending.
    : > "$src_train"
    : > "$tgt_train"
    # Unquoted on purpose: split the list of subset names into words.
    for subset in $subsets; do
        cat "$subset.$src-$tgt.$src" >> "$src_train"
        cat "$subset.$src-$tgt.$tgt" >> "$tgt_train"
    done
}

#######################################
# Emit an endless, deterministic pseudo-random byte stream derived from a seed,
# suitable for `shuf --random-source`.
# Arguments: $1 - seed string
# Outputs:   random bytes on stdout (the consumer is expected to stop reading)
#######################################
function get_seeded_random()
{
    local seed="$1"
    # Encrypting /dev/zero with a key derived from the seed yields a
    # reproducible stream; stderr is dropped because openssl complains once
    # the reader closes the pipe.
    openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
        </dev/zero 2>/dev/null
}

#######################################
# Shuffle raw_train.<src>-<tgt>.{src,tgt} with the same seed and split them
# into valid.* (first N lines) and train.* (the rest).
# Arguments: $1 - source language code
#            $2 - target language code
#            $3 - number of validation lines (optional, default 1500)
#######################################
function split_train_valid(){
    local src=$1
    local tgt=$2
    local valid_size=${3:-1500}
    local raw_src_train="raw_train.$src-$tgt.$src"
    local raw_tgt_train="raw_train.$src-$tgt.$tgt"

    # Same seed for both sides so that the two files receive the same
    # permutation and stay line-aligned.
    shuf --random-source=<(get_seeded_random 43) "$raw_src_train" > "shuffled.$src-$tgt.$src"
    shuf --random-source=<(get_seeded_random 43) "$raw_tgt_train" > "shuffled.$src-$tgt.$tgt"

    head -n "$valid_size" "shuffled.$src-$tgt.$src" > "valid.$src-$tgt.$src"
    head -n "$valid_size" "shuffled.$src-$tgt.$tgt" > "valid.$src-$tgt.$tgt"

    # "tail -n +K" starts output at line K; the bare "tail +K" form is obsolete.
    tail -n "+$((valid_size + 1))" "shuffled.$src-$tgt.$src" > "train.$src-$tgt.$src"
    tail -n "+$((valid_size + 1))" "shuffled.$src-$tgt.$tgt" > "train.$src-$tgt.$tgt"
}

#######################################
# Copy the train/valid files of the current directory to $DEST, renaming the
# two-letter language codes to the long codes given as arguments.
# Globals:   DEST (read)
# Arguments: $1 - long source code, e.g. xh_ZA (first two chars = short code)
#            $2 - long target code, e.g. en_XX (first two chars = short code)
#######################################
function copy2dst(){
    local lsrc=$1
    local ltgt=$2
    local src=${lsrc:0:2}
    local tgt=${ltgt:0:2}

    cp "valid.$src-$tgt.$src" "$DEST/valid.$lsrc-$ltgt.$lsrc"
    cp "valid.$src-$tgt.$tgt" "$DEST/valid.$lsrc-$ltgt.$ltgt"

    cp "train.$src-$tgt.$src" "$DEST/train.$lsrc-$ltgt.$lsrc"
    cp "train.$src-$tgt.$tgt" "$DEST/train.$lsrc-$ltgt.$ltgt"
}


#for xh-en
declare -A xh_en_urls
xh_en_urls=(
    [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/en-xh.tmx.gz
    [wikimedia]=https://object.pouta.csc.fi/OPUS-wikimedia/v20190628/tmx/en-xh.tmx.gz
    [memat]=https://object.pouta.csc.fi/OPUS-memat/v1/tmx/en-xh.tmx.gz
    [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/en-xh.tmx.gz
    [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/en-xh.tmx.gz
    [XhosaNavy]=https://object.pouta.csc.fi/OPUS-XhosaNavy/v1/tmx/en-xh.tmx.gz
    [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/en-xh.tmx.gz
    [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/en-xh.tmx.gz
)

mkdir -p "$TMP_DIR/xh-en"
pushd "$TMP_DIR/xh-en"
for k in "${!xh_en_urls[@]}"
do
    name=$k
    url=${xh_en_urls[$k]}
    echo "$name: $url"
    download_opus xh en "$name" "$url"
done
concat_subsets xh en "${!xh_en_urls[@]}"
split_train_valid xh en
copy2dst xh_ZA en_XX
popd


##
#for af-en
declare -A af_en_urls
af_en_urls=(
    [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/af-en.tmx.gz
    [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/af-en.tmx.gz
    [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/af-en.tmx.gz
    [QED]=https://object.pouta.csc.fi/OPUS-QED/v2.0a/tmx/af-en.tmx.gz
    [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/af-en.tmx.gz
    [OpenSubtitles]=https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/af-en.tmx.gz
    [SPC]=https://object.pouta.csc.fi/OPUS-SPC/v1/tmx/af-en.tmx.gz
    [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/af-en.tmx.gz
)

mkdir -p "$TMP_DIR/af-en"
pushd "$TMP_DIR/af-en"
for k in "${!af_en_urls[@]}"
do
    name=$k
    url=${af_en_urls[$k]}
    echo "$name: $url"
    download_opus af en "$name" "$url"
done
concat_subsets af en "${!af_en_urls[@]}"
split_train_valid af en
copy2dst af_ZA en_XX
popd