Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Stop at the first failing command or unset variable: every later stage
# reads the files written by the stage before it, so continuing after a
# failure would only operate on missing or stale data.
set -eu

# Language pair to process.
source_lang=kk_KZ
target_lang=en_XX

# Model checkpoint, SentencePiece model and dictionaries.
MODEL=criss_checkpoints/criss.3rd.pt
SPM=criss_checkpoints/sentence.bpe.model
LANG_DICT=criss_checkpoints/lang_dict.txt
DICT=criss_checkpoints/dict.txt

# Data subset handed to the encoder-saving script via --gen-subset.
SPLIT=test

# Helper scripts invoked below.
SPM_ENCODE=flores/scripts/spm_encode.py
SAVE_ENCODER=save_encoder.py

# Values passed to mining/mine.py as --threshold and --min-count.
THRESHOLD=1.02
MIN_COUNT=500

# Derived input and output locations.
DATA_DIR=data_tmp
ENCODER_SAVE_ROOT=sentence_embeddings/$MODEL
SAVE_DIR=mining/${source_lang}_${target_lang}_mined
ENCODER_SAVE_DIR=${ENCODER_SAVE_ROOT}/${source_lang}-${target_lang}
INPUT_DIR=$DATA_DIR/${source_lang}-${target_lang}-tatoeba

# Create every output directory up front. The paths are quoted so that a
# value containing whitespace or glob characters stays a single argument.
mkdir -p \
    "${ENCODER_SAVE_DIR}/${target_lang}" \
    "${ENCODER_SAVE_DIR}/${source_lang}" \
    "${SAVE_DIR}"
## Save encoder outputs

# Save encoder outputs for source sentences.
# Reads the ${SPLIT} subset from ${INPUT_DIR} and writes into the
# ${source_lang} subdirectory of ${ENCODER_SAVE_DIR}. All expansions are
# quoted so that each one is passed as exactly one argument.
python "${SAVE_ENCODER}" \
    "${INPUT_DIR}" \
    --path "${MODEL}" \
    --task translation_multi_simple_epoch \
    --lang-pairs "${source_lang}-${target_lang}" \
    --lang-dict "${LANG_DICT}" \
    --gen-subset "${SPLIT}" \
    --bpe 'sentencepiece' \
    -s "${source_lang}" -t "${target_lang}" \
    --sentencepiece-model "${SPM}" \
    --remove-bpe 'sentencepiece' \
    --beam 1 \
    --lang-tok-style mbart \
    --encoder-save-dir "${ENCODER_SAVE_DIR}/${source_lang}"
## Save encoder outputs for target sentences
# Same invocation as for the source side, except that -s and -t are swapped
# and the output goes to the ${target_lang} subdirectory.
# NOTE(review): --lang-pairs is still given as source-target while -s/-t are
# reversed; presumably the swap is what makes the target-language text the
# encoder input -- confirm against save_encoder.py.
python "${SAVE_ENCODER}" \
    "${INPUT_DIR}" \
    --path "${MODEL}" \
    --lang-pairs "${source_lang}-${target_lang}" \
    --lang-dict "${LANG_DICT}" \
    --task translation_multi_simple_epoch \
    --gen-subset "${SPLIT}" \
    --bpe 'sentencepiece' \
    -t "${source_lang}" -s "${target_lang}" \
    --sentencepiece-model "${SPM}" \
    --remove-bpe 'sentencepiece' \
    --beam 1 \
    --lang-tok-style mbart \
    --encoder-save-dir "${ENCODER_SAVE_DIR}/${target_lang}"
## Mining
# Run mining/mine.py on the encoder outputs saved for both languages and
# write the result to ${SAVE_DIR}.
# The final argument deliberately has no trailing backslash, so the command
# ends here regardless of what line follows it.
python mining/mine.py \
    --src-lang "${source_lang}" \
    --tgt-lang "${target_lang}" \
    --dim 1024 \
    --mem 10 \
    --neighborhood 4 \
    --src-dir "${ENCODER_SAVE_DIR}/${source_lang}" \
    --tgt-dir "${ENCODER_SAVE_DIR}/${target_lang}" \
    --output "${SAVE_DIR}" \
    --threshold "${THRESHOLD}" \
    --min-count "${MIN_COUNT}" \
    --valid-size 100 \
    --dict-path "${DICT}" \
    --spm-path "${SPM}"
## Process and binarize mined data
# All paths are built from ${SAVE_DIR} (the directory given to the mining
# step as --output) rather than re-spelling it, so the stages cannot drift
# apart if SAVE_DIR is changed.

# Encode both language sides of the train and valid files with the
# SentencePiece model, producing <split>.bpe.<lang> next to the inputs.
for mined_split in train valid; do
    python "${SPM_ENCODE}" \
        --model "${SPM}" \
        --output_format=piece \
        --inputs \
            "${SAVE_DIR}/${mined_split}.${source_lang}" \
            "${SAVE_DIR}/${mined_split}.${target_lang}" \
        --outputs \
            "${SAVE_DIR}/${mined_split}.bpe.${source_lang}" \
            "${SAVE_DIR}/${mined_split}.bpe.${target_lang}"
done

# Binarize the encoded files into the same directory.
fairseq-preprocess \
    --source-lang "${source_lang}" \
    --target-lang "${target_lang}" \
    --trainpref "${SAVE_DIR}/train.bpe" \
    --validpref "${SAVE_DIR}/valid.bpe" \
    --destdir "${SAVE_DIR}" \
    --srcdict "${DICT}" \
    --joined-dictionary \
    --workers 8