Spaces:

OFA-Sys
/

OFA-Visual_Grounding

Runtime error

App Files Files Community

OFA-Visual_Grounding / fairseq /examples /joint_alignment_translation /prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh

JustinLin610

update

10b0761 over 2 years ago

raw

history blame

3.34 kB

	#!/bin/bash

	# Copyright (c) Facebook, Inc. and its affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	# Fetch the Moses toolkit, whose perl scripts are used for tokenization and
	# corpus cleaning. The clone is skipped when a checkout is already present
	# so that re-running the script does not try to clone into an existing
	# directory.
	if [ ! -d mosesdecoder ]; then
	    echo 'Cloning Moses github repository (for tokenization scripts)...'
	    git clone https://github.com/moses-smt/mosesdecoder.git
	fi

	# Root of the Moses script tree inside the local checkout.
	SCRIPTS=mosesdecoder/scripts

	# Individual Moses perl scripts invoked by the steps below.
	CLEAN="${SCRIPTS}/training/clean-corpus-n.perl"
	TOKENIZER="${SCRIPTS}/tokenizer/tokenizer.perl"
	REM_NON_PRINT_CHAR="${SCRIPTS}/tokenizer/remove-non-printing-char.perl"

	# Archives to download, in order: four training corpora followed by the
	# archives holding the dev sets and the full test sets.
	URLS=()
	URLS+=("http://statmt.org/wmt13/training-parallel-europarl-v7.tgz")
	URLS+=("http://statmt.org/wmt13/training-parallel-commoncrawl.tgz")
	URLS+=("http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz")
	URLS+=("http://data.statmt.org/wmt18/translation-task/rapid2016.tgz")
	URLS+=("http://data.statmt.org/wmt17/translation-task/dev.tgz")
	URLS+=("http://statmt.org/wmt14/test-full.tgz")

	# Training corpus path prefixes; the language code is appended as a
	# suffix when the files are read.
	CORPORA=()
	CORPORA+=("training/europarl-v7.de-en")
	CORPORA+=("commoncrawl.de-en")
	CORPORA+=("training-parallel-nc-v13/news-commentary-v13.de-en")
	CORPORA+=("rapid2016.de-en")

	# Abort when the Moses scripts directory is missing: the preprocessing
	# steps below all invoke scripts located under it. The message is written
	# to stderr and the exit status is non-zero so that callers can detect
	# the failure.
	if [ ! -d "$SCRIPTS" ]; then
	    echo "Please set SCRIPTS variable correctly to point to Moses scripts." >&2
	    exit 1
	fi

	# Language pair: source and target codes, and the tag combining both that
	# is used in intermediate file names.
	src=en
	tgt=de
	lang="${src}-${tgt}"

	# Directory layout: raw downloads, preparation root with its scratch
	# directory, and the directory receiving the BPE codes and encoded data.
	orig=orig
	prep="wmt18_${src}_${tgt}"
	tmp="${prep}/tmp"
	bpe=bpe.32k

	# Validation set path prefix (relative to $orig) and the number of BPE
	# codes to learn.
	dev=dev/newstest2012
	codes=32000

	# Create the working directories: raw downloads, scratch space, the
	# preparation root and the BPE output directory.
	mkdir -p "$orig" "$tmp" "$prep" "$bpe"

	# Download every archive listed in URLS into $orig and unpack it there.
	# An archive that is already present is neither downloaded nor unpacked
	# again.
	cd "$orig" || exit 1

	for url in "${URLS[@]}"; do
	    file=$(basename "$url")
	    if [ -f "$file" ]; then
	        echo "$file already exists, skipping download"
	        continue
	    fi

	    # Check wget's exit status as well as the file's presence: an
	    # interrupted transfer can leave a truncated file behind, which
	    # would otherwise be reported as a success and skipped on the
	    # next run. Remove any such leftover before aborting.
	    if wget "$url" && [ -f "$file" ]; then
	        echo "$url successfully downloaded."
	    else
	        echo "$url not successfully downloaded."
	        rm -f -- "$file"
	        exit 1
	    fi

	    # Unpack according to the archive suffix; stop on extraction errors
	    # instead of continuing with incomplete data.
	    case "$file" in
	        *.tgz)
	            tar zxvf "$file" || exit 1
	            ;;
	        *.tar)
	            tar xvf "$file" || exit 1
	            ;;
	    esac
	done
	cd ..

	# Build the tokenized training file for each language: every corpus in
	# CORPORA is passed through the non-printing-character filter and the
	# Moses tokenizer (with -no-escape), and the results are appended to one
	# file per language. The output file is removed first because the loop
	# appends to it.
	echo "pre-processing train data..."
	for l in $src $tgt; do
	    rm -rf "$tmp/train.tags.$lang.tok.$l"
	    for f in "${CORPORA[@]}"; do
	        # The stages must be joined by real pipes; an escaped "\|" would
	        # be handed to cat as a literal file name argument.
	        cat "$orig/$f.$l" \
	            | perl "$REM_NON_PRINT_CHAR" \
	            | perl "$TOKENIZER" -threads 8 -l "$l" -no-escape >> "$tmp/train.tags.$lang.tok.$l"
	    done
	done

	# Filter for segment lines read on stdin:
	#   - removes the opening <seg id="N"> tag (id of any number of digits)
	#     together with any whitespace following it,
	#   - removes the closing </seg> tag together with surrounding whitespace,
	#   - replaces the typographic apostrophe (’) with an ASCII one.
	# The "*" quantifiers are required: without them only single-digit ids
	# followed by exactly one whitespace character would be matched.
	strip_seg_markup() {
	    sed -e 's/<seg id="[0-9]*">\s*//g' \
	        -e 's/\s*<\/seg>\s*//g' \
	        -e "s/\’/\'/g"
	}

	# Extract the newstest2014 segments for each language, strip the markup
	# and tokenize them. The source-language file carries the "src" tag in
	# its name, the target-language file the "ref" tag.
	echo "pre-processing test data..."
	for l in $src $tgt; do
	    if [ "$l" == "$src" ]; then
	        t="src"
	    else
	        t="ref"
	    fi
	    # The stages must be joined by real pipes; an escaped "\|" would be
	    # handed to grep as a literal file name argument.
	    grep '<seg id' "$orig/test-full/newstest2014-deen-$t.$l.sgm" \
	        | strip_seg_markup \
	        | perl "$TOKENIZER" -threads 8 -l "$l" -no-escape > "$tmp/test.$l"
	    echo ""
	done

	# Length-filter the tokenized training data before BPE is applied. The
	# Moses cleaning script receives the ratio option, the corpus prefix,
	# both language codes, the output prefix, and the bounds 1 and 100.
	perl "$CLEAN" -ratio 1.5 \
	    "$tmp/train.tags.$lang.tok" "$src" "$tgt" \
	    "$tmp/train" 1 100

	# use newstest2012 for valid
	# Each language side of the dev set goes through the same filter and
	# tokenizer as the training data. The output file is removed first
	# because the pipeline appends to it.
	echo "pre-processing valid data..."
	for l in $src $tgt; do
	    rm -rf "$tmp/valid.$l"
	    # The stages must be joined by real pipes; an escaped "\|" would be
	    # handed to cat as a literal file name argument.
	    cat "$orig/$dev.$l" \
	        | perl "$REM_NON_PRINT_CHAR" \
	        | perl "$TOKENIZER" -threads 8 -l "$l" -no-escape >> "$tmp/valid.$l"
	done

	# Gather the train/valid/test files of both languages in ./output.
	# -p keeps the step re-runnable when the directory already exists.
	mkdir -p output
	# Brace expansion runs before parameter expansion, so this expands to
	# the six paths train|valid|test x source|target.
	mv -- "$tmp"/{train,valid,test}.{"$src","$tgt"} output

	#BPE
	# Fetch and build fastBPE, learn $codes BPE codes from both training
	# files into $bpe/codes, then encode every split of both languages
	# into $bpe.
	# The clone is skipped when a checkout already exists, and the script
	# stops if the clone or the build fails, since the later commands need
	# the fastBPE/fast binary.
	if [ ! -d fastBPE ]; then
	    git clone https://github.com/glample/fastBPE.git || exit 1
	fi
	pushd fastBPE || exit 1
	g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast || exit 1
	popd
	fastBPE/fast learnbpe "$codes" "output/train.$src" "output/train.$tgt" > "$bpe/codes"
	# Iterate over $src/$tgt with a dedicated loop variable rather than
	# reusing "lang", which holds the language-pair tag.
	for split in train valid test; do
	    for l in $src $tgt; do
	        fastBPE/fast applybpe "$bpe/$split.$l" "output/$split.$l" "$bpe/codes"
	    done
	done