# Spaces: Runtime error
# (status banner text captured together with the script; not a command)
# Build one Lucene index over the mBERT-tokenized Mr. TyDi corpora:
#   1. download each language's corpus.jsonl.gz into its own sub-directory,
#   2. merge the per-language corpora with scripts/mrtydi/combine_corpus.py,
#   3. index the merged collection with pyserini.
#
# Needs `wget` and `python` on PATH. The combine script is referenced by a
# relative path, so run this from the directory that contains `scripts/`.

# Abort on the first failing command and on any use of an unset variable.
set -eu

readonly sep_collection_dir=all-mbert-mrtydi-corpus       # per-language downloads: <dir>/<lang>/
readonly merged_collection_dir=merged-mbert-mrtydi-corpus # the dir to the combined collection
readonly index_dir=all-language-index-optimized           # output directory of the index

mkdir -p -- "$sep_collection_dir"
mkdir -p -- "$merged_collection_dir"

# download files
for lang in arabic bengali english finnish indonesian japanese korean russian swahili telugu thai; do
  printf 'Downloading %s corpus\n' "$lang"
  lang_dir="$sep_collection_dir/$lang"
  mkdir -p -- "$lang_dir"
  wget "https://huggingface.co/datasets/crystina-z/mbert-mrtydi-corpus/resolve/main/mr-tydi-v1.1-mbert-tokenize-$lang/corpus.jsonl.gz" -P "$lang_dir"
done

# Merge the per-language corpora into the single collection directory.
python scripts/mrtydi/combine_corpus.py -i "$sep_collection_dir" -o "$merged_collection_dir"

# index
# Each backslash must be the very last character on its line, otherwise the
# command is split and the remaining options are run as separate commands.
python -m pyserini.index \
  -collection MrTyDiCollection \
  -generator DefaultLuceneDocumentGenerator \
  -threads 12 \
  -input "$merged_collection_dir" \
  -index "$index_dir" \
  -storePositions -storeRaw -storeDocvectors \
  -pretokenized -optimize