File size: 1,016 Bytes
d6585f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
set -e

sep_collection_dir=all-mbert-mrtydi-corpus  # the dir to the combined collection 
merged_collection_dir=merged-mbert-mrtydi-corpus  # the dir to the combined collection 
index_dir=all-language-index-optimized

mkdir -p $sep_collection_dir
mkdir -p $merged_collection_dir

# download files
for lang in arabic  bengali  english finnish  indonesian  japanese  korean  russian  swahili  telugu  thai ; do
	echo "Downloading $lang corpus"
	lang_dir=$sep_collection_dir/$lang
	mkdir -p $lang_dir
	wget "https://huggingface.co/datasets/crystina-z/mbert-mrtydi-corpus/resolve/main/mr-tydi-v1.1-mbert-tokenize-$lang/corpus.jsonl.gz" -P $lang_dir
done

python scripts/mrtydi/combine_corpus.py -i $sep_collection_dir -o $merged_collection_dir

# index
python -m pyserini.index  \
    -collection MrTyDiCollection \
    -generator DefaultLuceneDocumentGenerator \
    -threads 12 \
    -input $merged_collection_dir \
    -index $index_dir \
    -storePositions -storeRaw -storeDocvectors \
    -pretokenized -optimize