File size: 845 Bytes
d6585f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!/bin/bash
#
# Runs `python -m pyserini.encode` once for every shard index from START_IDX
# through END_IDX (both inclusive). For index N the corpus is read from
# "$SHARDED_CORPUS_PREFIX/shard_N" and the embeddings are written to
# "$SHARDED_EMBEDDING_PREFIX/shard_N".
#
# Edit START_IDX / END_IDX / GPU_ID below to choose which shard indices this
# invocation handles and which CUDA device is passed to the encoder.
#
# Exit status: 0 when every encoder run succeeded, 1 when at least one failed.

# Fail on references to unset variables (catches typos in the names below).
set -u

SHARDED_CORPUS_PREFIX="/root/Corpus/CAsT22_msmarcov2_kilt_flattened_8shards"
SHARDED_EMBEDDING_PREFIX="/ssd3/geonminkim/indexes/CAsT_21_22_msmarcov2_kilt/dense"

START_IDX=0 # first shard index to encode (inclusive)
END_IDX=1   # last shard index to encode (inclusive)
SHARD_NUM=8 # forwarded to the encoder as --shard-num
GPU_ID=0    # forwarded to the encoder as cuda:<GPU_ID>

# Indices whose encoder run returned a non-zero status.
failed_shards=()

for ((SHARD_IDX = START_IDX; SHARD_IDX <= END_IDX; SHARD_IDX++)); do
  echo "encoding for SHARD_IDX = $SHARD_IDX"

  # NOTE(review): the corpus path already points at a per-index shard_<idx>
  # directory AND --shard-id/--shard-num are passed. If the encoder further
  # partitions the given corpus by these flags, only a fraction of each
  # directory would be encoded -- confirm this combination is intended.
  #
  # A failing shard does not stop the loop: the remaining shards are still
  # attempted, and the failure is reported once the loop is done.
  if ! python -m pyserini.encode \
    input --corpus "${SHARDED_CORPUS_PREFIX}/shard_${SHARD_IDX}" \
          --fields text \
          --shard-id "${SHARD_IDX}" \
          --shard-num "${SHARD_NUM}" \
    output --embeddings "${SHARDED_EMBEDDING_PREFIX}/shard_${SHARD_IDX}" \
           --to-faiss \
    encoder --encoder castorini/tct_colbert-v2-msmarco-cqe \
            --fields text \
            --device "cuda:${GPU_ID}" \
            --batch 128 \
            --fp16; then
    echo "encoding FAILED for SHARD_IDX = $SHARD_IDX" >&2
    failed_shards+=("$SHARD_IDX")
  fi
done

if ((${#failed_shards[@]} > 0)); then
  echo "failed shard indices: ${failed_shards[*]}" >&2
  exit 1
fi