File size: 3,315 Bytes
8437114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Fail fast: abort on the first failing command, on any use of an unset
# variable, and when any stage of a pipeline fails, so that later stages of
# this script never run on missing or partial outputs of earlier ones.
set -euo pipefail

# --- Language pair ---------------------------------------------------------
source_lang=kk_KZ
target_lang=en_XX

# --- Checkpoint, SentencePiece model and dictionaries ----------------------
MODEL=criss_checkpoints/criss.3rd.pt
SPM=criss_checkpoints/sentence.bpe.model
LANG_DICT=criss_checkpoints/lang_dict.txt
DICT=criss_checkpoints/dict.txt

# --- Helper scripts --------------------------------------------------------
SPM_ENCODE=flores/scripts/spm_encode.py
SAVE_ENCODER=save_encoder.py

# --- Run parameters --------------------------------------------------------
# Data subset handed to the encoder-saving step via --gen-subset.
SPLIT=test
# Values handed to mining/mine.py via --threshold and --min-count.
THRESHOLD=1.02
MIN_COUNT=500

# --- Directory layout ------------------------------------------------------
# NOTE: MODEL is itself a relative path, so the embeddings root expands to
# sentence_embeddings/criss_checkpoints/criss.3rd.pt (a directory).
ENCODER_SAVE_ROOT=sentence_embeddings/$MODEL

DATA_DIR=data_tmp
SAVE_DIR=mining/${source_lang}_${target_lang}_mined
ENCODER_SAVE_DIR=${ENCODER_SAVE_ROOT}/${source_lang}-${target_lang}
INPUT_DIR=$DATA_DIR/${source_lang}-${target_lang}-tatoeba

# Create one encoder-output directory per language plus the mining output
# directory. Operands are quoted so a path containing spaces stays one word.
mkdir -p -- \
  "$ENCODER_SAVE_DIR/${target_lang}" \
  "$ENCODER_SAVE_DIR/${source_lang}" \
  "$SAVE_DIR"

## Save encoder outputs

#######################################
# Run $SAVE_ENCODER over $INPUT_DIR for one side of the language pair.
# Globals (read):
#   SAVE_ENCODER, INPUT_DIR, MODEL, LANG_DICT, SPLIT, SPM,
#   ENCODER_SAVE_DIR, source_lang, target_lang
# Arguments:
#   $1 - language passed as -s; outputs go to ${ENCODER_SAVE_DIR}/$1
#   $2 - language passed as -t
# Returns:
#   The exit status of the python invocation.
#######################################
save_encoder_outputs() {
  local encode_lang=$1
  local other_lang=$2

  # --lang-pairs intentionally stays "${source_lang}-${target_lang}" for both
  # directions; only -s/-t and the output directory vary between the calls.
  python "$SAVE_ENCODER" \
    "$INPUT_DIR" \
    --path "$MODEL" \
    --task translation_multi_simple_epoch \
    --lang-pairs "${source_lang}-${target_lang}" \
    --lang-dict "$LANG_DICT" \
    --gen-subset "$SPLIT" \
    --bpe 'sentencepiece' \
    -s "$encode_lang" -t "$other_lang" \
    --sentencepiece-model "$SPM" \
    --remove-bpe 'sentencepiece' \
    --beam 1 \
    --lang-tok-style mbart \
    --encoder-save-dir "${ENCODER_SAVE_DIR}/${encode_lang}"
}

# Save encoder outputs for source sentences
save_encoder_outputs "$source_lang" "$target_lang"

# Save encoder outputs for target sentences (-s and -t swapped)
save_encoder_outputs "$target_lang" "$source_lang"

## Mining
# Run mining/mine.py on the encoder outputs saved for both languages and
# write its results to $SAVE_DIR. The final argument deliberately has no
# trailing backslash: a dangling line continuation would splice whatever
# line follows into this command.
python mining/mine.py \
  --src-lang "$source_lang" \
  --tgt-lang "$target_lang" \
  --dim 1024 \
  --mem 10 \
  --neighborhood 4 \
  --src-dir "${ENCODER_SAVE_DIR}/${source_lang}" \
  --tgt-dir "${ENCODER_SAVE_DIR}/${target_lang}" \
  --output "$SAVE_DIR" \
  --threshold "$THRESHOLD" \
  --min-count "$MIN_COUNT" \
  --valid-size 100 \
  --dict-path "$DICT" \
  --spm-path "$SPM"

## Process and binarize mined data
# All paths below are built from $SAVE_DIR, the same directory the mining
# step is given as --output, rather than re-spelling the literal
# mining/<src>_<tgt>_mined path at every use.

# Apply the SentencePiece model to the train and valid files of both
# languages, producing <split>.bpe.<lang> next to each input.
for mined_split in train valid; do
  python "$SPM_ENCODE" \
    --model "$SPM" \
    --output_format=piece \
    --inputs \
      "${SAVE_DIR}/${mined_split}.${source_lang}" \
      "${SAVE_DIR}/${mined_split}.${target_lang}" \
    --outputs \
      "${SAVE_DIR}/${mined_split}.bpe.${source_lang}" \
      "${SAVE_DIR}/${mined_split}.bpe.${target_lang}"
done

# Binarize the BPE-encoded files in place (destdir is $SAVE_DIR), using the
# existing dictionary $DICT with --joined-dictionary.
fairseq-preprocess \
  --source-lang "$source_lang" \
  --target-lang "$target_lang" \
  --trainpref "${SAVE_DIR}/train.bpe" \
  --validpref "${SAVE_DIR}/valid.bpe" \
  --destdir "$SAVE_DIR" \
  --srcdict "$DICT" \
  --joined-dictionary \
  --workers 8