# Spaces:
# Runtime error
# Runtime error
# Builds retrieval indexes for the corpus under CORPUS_DIR (mrtydi-korean).
#
# The sparse (Lucene) indexing commands are kept below as commented-out
# reference only; the steps that actually run are the two dense encoding
# passes at the bottom (max length 512 and 256).
#
# Exit status: that of the last dense encoding run, unless the index
# directory cannot be created (exit 1).

# path setting
CORPUS_DIR=/root/Corpus/mrtydi-korean/collection
CORPUS_PATH=${CORPUS_DIR}/docs.jsonl
INDEX_DIR=indexes/mrtydi-korean
# Nothing below can succeed without the output directory, so stop early.
mkdir -p -- "$INDEX_DIR" || {
  printf 'error: cannot create index directory: %s\n' "$INDEX_DIR" >&2
  exit 1
}

# sparse indexing
# NOTE: lang, abbr and NUM_THREADS are referenced only by the commented-out
# sparse indexing commands below.
lang=korean
abbr=ko
NUM_THREADS=16

# `target` directory not found
#echo "sparse (anserini version)"
#target/appassembler/bin/IndexCollection \
#-collection MrTyDiCollection \
#-input $CORPUS_DIR \
#-index $INDEX_DIR/sparse_anserini \
#-generator DefaultLuceneDocumentGenerator \
#-threads $NUM_THREADS -storePositions -storeDocvectors -storeRaw -language $abbr

echo "sparse (pyserini version) ========================> SKIP ====================> "
#python -m pyserini.index.lucene \
#--collection JsonCollection \
#--input $CORPUS_DIR \
#--index $INDEX_DIR/sparse_pyserini \
#--generator DefaultLuceneDocumentGenerator \
#--language $abbr \
#--threads $NUM_THREADS \
#--storePositions --storeDocvectors --storeRaw

# dense indexing
echo "dense"
export CUDA_VISIBLE_DEVICES=1
ENCODER=castorini/mdpr-passage-nq

#######################################
# Runs `python -m pyserini.encode` over the corpus and writes the embeddings
# to ${INDEX_DIR}/dense_maxlen<max length>.
# Globals:   CORPUS_PATH, INDEX_DIR, ENCODER (read)
# Arguments: $1 - value passed to --max-length (also the output dir suffix)
#            $2 - value passed to --batch
# Returns:   exit status of the python command
#######################################
encode_dense() {
  local maxlen=$1
  local batch_size=$2

  # "\n\n" is passed as the literal characters backslash-n backslash-n (bash
  # does not expand escapes inside double quotes), exactly as the original
  # script did. Presumably pyserini unescapes it itself -- TODO confirm.
  python -m pyserini.encode \
    input --corpus "$CORPUS_PATH" \
      --fields title text \
      --delimiter "\n\n" \
    output --embeddings "${INDEX_DIR}/dense_maxlen${maxlen}" \
      --to-faiss \
    encoder --encoder "$ENCODER" \
      --fields title text \
      --max-length "$maxlen" \
      --batch "$batch_size" \
      --fp16
}

# First pass: max length 512 with a batch size of 8.
BATCH_SIZE=8
MAXLEN=512
encode_dense "$MAXLEN" "$BATCH_SIZE"

# Second pass runs regardless of the first pass's outcome.
BATCH_SIZE=32
MAXLEN=256 # default
encode_dense "$MAXLEN" "$BATCH_SIZE"