#!/bin/bash
#
# Build indexes for the collection under /root/Corpus/mrtydi-korean.
#
# Sparse (Lucene) indexing is currently disabled; its commands are kept below
# as comments for reference. Dense indexing runs `python -m pyserini.encode`
# twice and writes one Faiss index per max sequence length to
# $INDEX_DIR/dense_maxlen<MAXLEN>.
#
# Exit status: 0 if every encoding run succeeded, otherwise the status of the
# last run that failed (or 1 if a precondition check failed).

set -euo pipefail

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# ---------------------------------------------------------------------------
# path setting
# ---------------------------------------------------------------------------
readonly CORPUS_DIR=/root/Corpus/mrtydi-korean/collection
readonly CORPUS_PATH="${CORPUS_DIR}/docs.jsonl"
readonly INDEX_DIR=indexes/mrtydi-korean

mkdir -p -- "$INDEX_DIR"

# ---------------------------------------------------------------------------
# sparse indexing (disabled)
# ---------------------------------------------------------------------------
# These settings are only referenced by the commented-out commands below.
# shellcheck disable=SC2034
lang=korean
# shellcheck disable=SC2034
abbr=ko
# shellcheck disable=SC2034
NUM_THREADS=16

# `target` directory not found
#echo "sparse (anserini version)"
#target/appassembler/bin/IndexCollection \
#-collection MrTyDiCollection \
#-input $CORPUS_DIR \
#-index $INDEX_DIR/sparse_anserini \
#-generator DefaultLuceneDocumentGenerator \
#-threads $NUM_THREADS -storePositions -storeDocvectors -storeRaw -language $abbr

echo "sparse (pyserini version) ========================> SKIP ====================> "
#python -m pyserini.index.lucene \
#--collection JsonCollection \
#--input $CORPUS_DIR \
#--index $INDEX_DIR/sparse_pyserini \
#--generator DefaultLuceneDocumentGenerator \
#--language $abbr \
#--threads $NUM_THREADS \
#--storePositions --storeDocvectors --storeRaw

# ---------------------------------------------------------------------------
# dense indexing
# ---------------------------------------------------------------------------
echo "dense"
export CUDA_VISIBLE_DEVICES=1

readonly ENCODER=castorini/mdpr-passage-nq

#######################################
# Encode the corpus with $ENCODER and write a Faiss index.
# Globals:   CORPUS_PATH, INDEX_DIR, ENCODER (read)
# Arguments: $1 - batch size passed to --batch
#            $2 - max sequence length passed to --max-length; also used as
#                 the suffix of the output directory name
# Outputs:   index written to $INDEX_DIR/dense_maxlen<$2>
# Returns:   exit status of `python -m pyserini.encode`
#######################################
encode_dense() {
  local -r batch_size=$1
  local -r max_len=$2

  # The delimiter is handed over as the four literal characters \n\n (bash
  # does not expand backslash escapes inside double quotes).
  # NOTE(review): presumably pyserini turns these into real newlines - confirm.
  python -m pyserini.encode \
    input --corpus "$CORPUS_PATH" \
          --fields title text \
          --delimiter "\n\n" \
    output --embeddings "${INDEX_DIR}/dense_maxlen${max_len}" \
           --to-faiss \
    encoder --encoder "$ENCODER" \
            --fields title text \
            --max-length "$max_len" \
            --batch "$batch_size" \
            --fp16
}

# Fail fast with a clear message instead of letting both runs error out.
[[ -e "$CORPUS_PATH" ]] || die "corpus not found: $CORPUS_PATH"

# The two runs write to separate output directories, so a failure of one does
# not stop the other; any failure is still reflected in the exit status.
status=0

# BATCH_SIZE=8, MAXLEN=512
encode_dense 8 512 || status=$?

# BATCH_SIZE=32, MAXLEN=256 (default)
encode_dense 32 256 || status=$?

exit "$status"