Spaces:
Runtime error
Runtime error
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse | |
from tqdm import tqdm | |
import numpy as np | |
import pandas as pd | |
from pyserini.query_iterator import DefaultQueryIterator | |
from pyserini.encode import DprQueryEncoder, TctColBertQueryEncoder, AnceQueryEncoder, AutoQueryEncoder | |
from pyserini.encode import UniCoilQueryEncoder, SpladeQueryEncoder | |
def init_encoder(encoder, device):
    """Construct the query encoder that matches the given model name or path.

    The encoder class is chosen by case-insensitive substring match on
    ``encoder``; markers are tried in a fixed order and the first hit wins.
    When no marker matches, a plain ``AutoQueryEncoder`` is returned.

    Args:
        encoder: model name or path; also passed as the first argument to the
            selected encoder class.
        device: forwarded to the selected encoder class as ``device``.

    Returns:
        An instance of the selected query encoder class.
    """
    lowered = encoder.lower()
    # Ordered (marker, factory) pairs. Factories are lambdas so that only the
    # selected encoder is actually constructed.
    factories = (
        ('dpr', lambda: DprQueryEncoder(encoder, device=device)),
        ('tct', lambda: TctColBertQueryEncoder(encoder, device=device)),
        ('ance', lambda: AnceQueryEncoder(encoder, device=device, tokenizer_name='roberta-base')),
        ('sentence-transformers',
         lambda: AutoQueryEncoder(encoder, device=device, pooling='mean', l2_norm=True)),
        ('unicoil', lambda: UniCoilQueryEncoder(encoder, device=device)),
        ('splade', lambda: SpladeQueryEncoder(encoder, device=device)),
    )
    for marker, build in factories:
        if marker in lowered:
            return build()
    return AutoQueryEncoder(encoder, device=device)
def sparse_to_pseudo_query(embedding, weight_range, quant_range):
    """Turn a sparse ``{token: weight}`` embedding into a pseudo-query string.

    Each weight is divided by ``weight_range``, multiplied by ``quant_range``,
    rounded with ``np.round`` and converted to ``int``; the token is then
    repeated that many times. Tokens whose quantized weight is zero or
    negative contribute nothing.

    Args:
        embedding: mapping from token to numeric weight.
        weight_range: divisor applied to every weight; must be non-zero.
        quant_range: multiplier applied after the division.

    Returns:
        The repeated tokens joined by single spaces, in the mapping's
        iteration order (an empty string when nothing survives quantization).
    """
    repeated = []
    for tok, weight in embedding.items():
        repeated.extend([tok] * int(np.round(weight / weight_range * quant_range)))
    return " ".join(repeated)


if __name__ == '__main__':
    # Encode every query of a topics file and store the result at --output:
    # a TSV of "<id>\t<pseudo query>" lines when the encoder returns dicts
    # (sparse), otherwise a pickled DataFrame with id / text / embedding.
    parser = argparse.ArgumentParser()
    parser.add_argument('--topics', type=str,
                        help='path to topics file in tsv format or self-contained topics name', required=True)
    parser.add_argument('--encoder', type=str, help='encoder model name or path', required=True)
    parser.add_argument('--weight-range', type=int, help='range of weights for sparse embedding', required=False)
    parser.add_argument('--quant-range', type=int, help='range of quantization for sparse embedding', required=False)
    parser.add_argument('--output', type=str, help='path to stored encoded queries', required=True)
    parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]',
                        default='cpu', required=False)
    args = parser.parse_args()

    encoder = init_encoder(args.encoder, device=args.device)
    query_iterator = DefaultQueryIterator.from_topics(args.topics)

    is_sparse = False
    query_ids = []
    query_texts = []
    query_embeddings = []
    for topic_id, text in tqdm(query_iterator):
        embedding = encoder.encode(text)
        if isinstance(embedding, dict):
            # Both ranges are optional on the command line, but quantization
            # cannot work without them; fail with a usage message instead of
            # a TypeError / ZeroDivisionError deep inside the arithmetic.
            if not args.weight_range or args.quant_range is None:
                parser.error('--weight-range (non-zero) and --quant-range are required '
                             'when the encoder produces sparse embeddings')
            is_sparse = True
            embedding = sparse_to_pseudo_query(embedding, args.weight_range, args.quant_range)
        query_ids.append(topic_id)
        query_texts.append(text)
        query_embeddings.append(embedding)

    if is_sparse:
        # Explicit UTF-8 so non-ASCII tokens do not depend on the platform's
        # default encoding.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.writelines(f"{qid}\t{pseudo}\n" for qid, pseudo in zip(query_ids, query_embeddings))
    else:
        embeddings = pd.DataFrame({'id': query_ids, 'text': query_texts, 'embedding': query_embeddings})
        embeddings.to_pickle(args.output)