NetsPresso_QA / scripts /mrtydi /index-combined-corpus.sh
geonmin-kim's picture
Upload folder using huggingface_hub
d6585f5
raw
history blame
No virus
1.02 kB
set -e
sep_collection_dir=all-mbert-mrtydi-corpus # the dir to the combined collection
merged_collection_dir=merged-mbert-mrtydi-corpus # the dir to the combined collection
index_dir=all-language-index-optimized
mkdir -p $sep_collection_dir
mkdir -p $merged_collection_dir
# download files
for lang in arabic bengali english finnish indonesian japanese korean russian swahili telugu thai ; do
echo "Downloading $lang corpus"
lang_dir=$sep_collection_dir/$lang
mkdir -p $lang_dir
wget "https://huggingface.co/datasets/crystina-z/mbert-mrtydi-corpus/resolve/main/mr-tydi-v1.1-mbert-tokenize-$lang/corpus.jsonl.gz" -P $lang_dir
done
python scripts/mrtydi/combine_corpus.py -i $sep_collection_dir -o $merged_collection_dir
# index
python -m pyserini.index \
-collection MrTyDiCollection \
-generator DefaultLuceneDocumentGenerator \
-threads 12 \
-input $merged_collection_dir \
-index $index_dir \
-storePositions -storeRaw -storeDocvectors \
-pretokenized -optimize