File size: 2,291 Bytes
d6585f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/bin/bash
# Builds indexes for the Mr. TyDi Korean collection with Pyserini.
#
# -u: referencing an unset variable is an error instead of an empty string.
# -o pipefail: a pipeline fails if any stage fails.
# -e is deliberately NOT enabled: the encoding runs further down are
# independent of each other, and without -e a failure of one still lets the
# next one start (the behavior this script has always had).
set -uo pipefail

# path setting
# Directory holding the collection, and the JSONL file inside it that the
# dense encoder reads.
CORPUS_DIR=/root/Corpus/mrtydi-korean/collection
CORPUS_PATH=${CORPUS_DIR}/docs.jsonl

# Output root for every index produced by this script (relative to the
# current working directory).
INDEX_DIR=indexes/mrtydi-korean
# Stop early if the output directory cannot be created, rather than letting
# the later indexing commands run without a place to write.
mkdir -p -- "$INDEX_DIR" || {
  printf 'error: cannot create index directory: %s\n' "$INDEX_DIR" >&2
  exit 1
}

# sparse indexing
# Settings referenced by the (currently disabled) sparse indexing commands
# below: worker thread count, language code, and language name.
NUM_THREADS=16
abbr=ko
lang=korean

# Anserini variant -- disabled: `target` directory not found
#echo "sparse (anserini version)"
#target/appassembler/bin/IndexCollection \
    #-collection MrTyDiCollection \
    #-input $CORPUS_DIR \
    #-index $INDEX_DIR/sparse_anserini \
    #-generator DefaultLuceneDocumentGenerator \
    #-threads $NUM_THREADS -storePositions -storeDocvectors -storeRaw -language $abbr

# Pyserini variant -- the command is commented out; only the banner announcing
# that this step is skipped is printed.
printf '%s\n' "sparse (pyserini version) ========================> SKIP ====================> "
#python -m pyserini.index.lucene \
  #--collection JsonCollection \
  #--input $CORPUS_DIR \
  #--index $INDEX_DIR/sparse_pyserini \
  #--generator DefaultLuceneDocumentGenerator \
  #--language $abbr \
  #--threads $NUM_THREADS \
  #--storePositions --storeDocvectors --storeRaw


# dense indexing
# Encodes the corpus twice with the same encoder, once per max-length
# setting; each run writes to its own dense_maxlen<N> directory.
echo "dense"
# Only CUDA device 1 is made visible to the encoder processes.
export CUDA_VISIBLE_DEVICES=1
ENCODER=castorini/mdpr-passage-nq

#######################################
# Run one pyserini.encode pass over the corpus.
# Globals:
#   CORPUS_PATH (read) - corpus file passed to --corpus
#   INDEX_DIR   (read) - parent directory of the output
#   ENCODER     (read) - encoder name passed to --encoder
# Arguments:
#   $1 - maximum sequence length (--max-length)
#   $2 - batch size (--batch)
# Outputs:
#   Embeddings under "$INDEX_DIR/dense_maxlen<$1>" (--to-faiss is requested).
# Returns:
#   Exit status of the python process.
#######################################
encode_dense() {
  local max_len=$1
  local batch_size=$2

  # "\n\n" is passed as the literal characters backslash-n backslash-n (bash
  # does not expand escapes inside double quotes).
  # NOTE(review): presumably pyserini turns these into real newlines -- confirm.
  python -m pyserini.encode \
    input   --corpus "$CORPUS_PATH" \
            --fields title text \
            --delimiter "\n\n" \
    output  --embeddings "${INDEX_DIR}/dense_maxlen${max_len}" \
            --to-faiss \
    encoder --encoder "$ENCODER" \
            --fields title text \
            --max-length "$max_len" \
            --batch "$batch_size" \
            --fp16
}

# Run 1: long sequences with a small batch.
BATCH_SIZE=8
MAXLEN=512
encode_dense "$MAXLEN" "$BATCH_SIZE"

# Run 2: shorter sequences with a larger batch.
BATCH_SIZE=32
MAXLEN=256 # default
encode_dense "$MAXLEN" "$BATCH_SIZE"