JustinLin610's picture
first commit
ee21b96
raw
history blame
No virus
4.37 kB
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# set -x -e
if [ -z $WORKDIR_ROOT ] ;
then
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
exit
fi
# put intermediate files
TMP_DIR=$WORKDIR_ROOT/temp/af_xhv2
# output {train,valid,test} files to dest
DEST=${WORKDIR_ROOT}/ML50/raw
ROOT=${WORKDIR_ROOT}
UTILS=$PWD/utils
TMX2CORPUS="${UTILS}/tmx2corpus"
TMX_TOOL="python ${TMX2CORPUS}/tmx2corpus.py"
mkdir -p $TMP_DIR
mkdir -p $DEST
mkdir -p $UTILS
function download_opus(){
src=$1
tgt=$2
subset=$3
ulr=$4
mkdir extract_$subset.$src-$tgt
pushd extract_$subset.$src-$tgt
if [ ! -f "$subset.$src-$tgt.tmx.gz" ]; then
wget $url -O "$subset.$src-$tgt.tmx.gz"
gzip -d "$subset.$src-$tgt.tmx.gz"
f=$subset.$src-$tgt.tmx
$TMX_TOOL $f
mv bitext.$src ../$subset.$src-$tgt.$src
mv bitext.$tgt ../$subset.$src-$tgt.$tgt
fi
popd
}
function concat_subsets(){
src=$1
tgt=$2
subsets=$3
src_train=raw_train.$src-$tgt.$src
tgt_train=raw_train.$src-$tgt.$tgt
> $src_train
> $tgt_train
for subset in $subsets; do
cat $subset.$src-$tgt.$src >> $src_train
cat $subset.$src-$tgt.$tgt >> $tgt_train
done
}
function get_seeded_random()
{
seed="$1"
openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
</dev/zero 2>/dev/null
}
function split_train_valid(){
src=$1
tgt=$2
raw_src_train=raw_train.$src-$tgt.$src
raw_tgt_train=raw_train.$src-$tgt.$tgt
shuf --random-source=<(get_seeded_random 43) $raw_src_train > shuffled.$src-$tgt.$src
shuf --random-source=<(get_seeded_random 43) $raw_tgt_train > shuffled.$src-$tgt.$tgt
head -n 1500 shuffled.$src-$tgt.$src > valid.$src-$tgt.$src
head -n 1500 shuffled.$src-$tgt.$tgt > valid.$src-$tgt.$tgt
tail +1501 shuffled.$src-$tgt.$src > train.$src-$tgt.$src
tail +1501 shuffled.$src-$tgt.$tgt > train.$src-$tgt.$tgt
}
function copy2dst(){
lsrc=$1
ltgt=$2
src=${lsrc:0:2}
tgt=${ltgt:0:2}
cp valid.$src-$tgt.$src $DEST/valid.$lsrc-$ltgt.$lsrc
cp valid.$src-$tgt.$tgt $DEST/valid.$lsrc-$ltgt.$ltgt
cp train.$src-$tgt.$src $DEST/train.$lsrc-$ltgt.$lsrc
cp train.$src-$tgt.$tgt $DEST/train.$lsrc-$ltgt.$ltgt
}
#for xh-en
declare -A xh_en_urls
xh_en_urls=(
[Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/en-xh.tmx.gz
[wikimedia]=https://object.pouta.csc.fi/OPUS-wikimedia/v20190628/tmx/en-xh.tmx.gz
[memat]=https://object.pouta.csc.fi/OPUS-memat/v1/tmx/en-xh.tmx.gz
[uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/en-xh.tmx.gz
[GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/en-xh.tmx.gz
[XhosaNavy]=https://object.pouta.csc.fi/OPUS-XhosaNavy/v1/tmx/en-xh.tmx.gz
[KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/en-xh.tmx.gz
[Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/en-xh.tmx.gz
)
mkdir $TMP_DIR/xh-en
pushd $TMP_DIR/xh-en
for k in "${!xh_en_urls[@]}"
do
name=$k
url=${xh_en_urls[$k]}
echo "$name: $url"
download_opus xh en $name $ulr
done
concat_subsets xh en "${!xh_en_urls[@]}"
split_train_valid xh en
copy2dst xh_ZA en_XX
popd
##
#for af-en
declare -A af_en_urls
af_en_urls=(
[Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/af-en.tmx.gz
[uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/af-en.tmx.gz
[GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/af-en.tmx.gz
[QED]=https://object.pouta.csc.fi/OPUS-QED/v2.0a/tmx/af-en.tmx.gz
[KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/af-en.tmx.gz
[OpenSubtitles]=https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/af-en.tmx.gz
[SPC]=https://object.pouta.csc.fi/OPUS-SPC/v1/tmx/af-en.tmx.gz
[Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/af-en.tmx.gz
)
mkdir $TMP_DIR/af-en
pushd $TMP_DIR/af-en
for k in "${!af_en_urls[@]}"
do
name=$k
url=${af_en_urls[$k]}
echo "$name: $url"
download_opus af en $name $ulr
done
concat_subsets af en "${!af_en_urls[@]}"
split_train_valid af en
copy2dst af_ZA en_XX
popd