10zinten's picture
Duplicate from openpecha/tibetan-aligner-api
1a3c007
#!/bin/bash
#
# Align a Tibetan text with its English translation.
#
# Usage: <this script> TIBETAN_FILE ENGLISH_FILE [OUTPUT_DIR]
#   TIBETAN_FILE  a file in Tibetan unicode
#   ENGLISH_FILE  a file with English in plain text
#   OUTPUT_DIR    output path (default: "output"); created if missing
#
# Must be run from the directory containing get_vectors.py, vecalign.py,
# ladder2org.py, create_train.py and create_train_clean.py, because they are
# invoked by relative path. The raw alignment is written to ./ladder.
#
# Files placed in OUTPUT_DIR (named after the basename of TIBETAN_FILE /
# ENGLISH_FILE): <tib>.org, <tib>.train, <tib>.train_cleaned and the
# *.work_overlay files. The input files themselves are left where they are,
# and unrelated *.txt* files in the working directory are no longer touched.
#
# Exit status: 0 on success, 2 on bad usage, non-zero if any step fails.
set -euo pipefail

number_of_overlays=6 # the higher the number of overlays, the more precise alignment is going to be, but also slower
deletion=0.06 # higher = less precise
search_buffer_size=50 # passed to vecalign as --search_buffer_size

if (( $# < 2 )); then
  printf 'Usage: %s TIBETAN_FILE ENGLISH_FILE [OUTPUT_DIR]\n' "${0##*/}" >&2
  exit 2
fi

src=$1
tgt=$2
output_dir=${3:-"output"}

for input in "$src" "$tgt"; do
  if [[ ! -f "$input" ]]; then
    printf '[ERROR] input file not found: %s\n' "$input" >&2
    exit 1
  fi
done

# Remove intermediate files on every exit path, including failures.
cleanup() {
  rm -f -- "$src.work" "$tgt.work" "$tgt.work2" \
    "$src.work_vectors.npy" "$tgt.work_vectors.npy"
}
trap cleanup EXIT

# Work on copies so the input files are never modified.
cp -- "$src" "$src.work"
cp -- "$tgt" "$tgt.work"
mkdir -p -- "$output_dir"
# NOTE(review): this copy is not referenced again by this script; it is kept
# in case one of the Python helpers reads it -- TODO confirm and drop if unused.
cp -- "$tgt.work" "$tgt.work2"

echo '[INFO] Getting Embedding...'
time python get_vectors.py "$src.work" "$number_of_overlays"
time python get_vectors.py "$tgt.work" "$number_of_overlays"

echo '[INFO] Running alignment...'
# '>' truncates, so a ladder left over from a previous run is never appended to.
time ./vecalign.py -a "$number_of_overlays" -d "$deletion" \
  --search_buffer_size "$search_buffer_size" \
  --alignment_max_size "$number_of_overlays" \
  --src "$src.work" --tgt "$tgt.work" \
  --src_embed "$src.work_overlay" "$src.work_vectors.npy" \
  --tgt_embed "$tgt.work_overlay" "$tgt.work_vectors.npy" > ladder

python ladder2org.py "$src.work" "$tgt.work" ladder > "$src.org"
python create_train.py "$src.work" "$tgt.work" ladder > "$src.train"
python create_train_clean.py "$src.work" "$tgt.work" ladder > "$src.train_cleaned"

# Collect the results in the output directory. Only files derived from the two
# inputs are moved, under their basename, so inputs given with a directory
# component end up in the right place.
results=(
  "$src.org"
  "$src.train"
  "$src.train_cleaned"
  "$src.work_overlay"
  "$tgt.work_overlay"
)
for result in "${results[@]}"; do
  if [[ ! -e "$result" ]]; then
    continue
  fi
  dest="$output_dir/${result##*/}"
  # Skip when the output directory is where the file already lives.
  if [[ "$result" -ef "$dest" ]]; then
    continue
  fi
  mv -- "$result" "$dest"
done

src_name=${src##*/}
echo "[OUTPUT] $output_dir/${src_name}.train_cleaned"