# NetsPresso_QA / scripts / trec-covid-ranker.py
# (Hugging Face file-viewer chrome — uploader geonmin-kim, "Upload folder using
# huggingface_hub", revision d6585f5, raw/history/blame links, 10.4 kB —
# commented out so this file parses as Python.)
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os
import json
import sys
sys.path.append('..')
sys.path.append('../pyserini')
import subprocess
from enum import Enum
from pyserini.vectorizer import TfidfVectorizer
from pyserini.vectorizer import BM25Vectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from typing import List
from sklearn import preprocessing
from typing import List, Set
def normalize(scores):
    """Min-max normalize `scores` to the range [0, 1].

    Fixes: the original raised ZeroDivisionError when all scores were equal
    (width == 0) and ValueError on an empty list (min/max of empty sequence).
    An empty input now returns [] and a constant input maps every score to 0.0.
    """
    if not scores:
        return []
    low = min(scores)
    high = max(scores)
    width = high - low
    if width == 0:
        # All scores identical: no spread to normalize over.
        return [0.0 for _ in scores]
    return [(s - low) / width for s in scores]
def sort_dual_list(pred, docs):
    """Sort `pred` and `docs` together by prediction score, descending.

    Fixes: the original shadowed the builtin `tuple`, crashed with ValueError
    on empty input (unpacking `zip(*[])`), and took a convoluted
    sort-ascending-then-reverse path. Ties on `pred` still break by `docs`
    descending, exactly as before.
    """
    if not pred:
        return [], []
    pairs = sorted(zip(pred, docs), reverse=True)
    sorted_pred = [p for p, _ in pairs]
    sorted_docs = [d for _, d in pairs]
    return sorted_pred, sorted_docs
def sort_str_topics_list(topics: List[str]) -> List[str]:
    """Sort string topic ids numerically and return them as strings again."""
    return [str(n) for n in sorted(int(t) for t in topics)]
def get_topics_from_qrun(path: str) -> List[str]:
    """Collect the distinct topic ids (first whitespace token of each line)
    from a TREC run file and return them sorted numerically.

    Fix: the return annotation claimed ``Set[str]`` but the function has always
    returned the sorted ``List[str]`` produced by ``sort_str_topics_list``.
    """
    topics = set()
    with open(path, 'r') as f:
        for line in f:
            topics.add(line.split()[0])
    return sort_str_topics_list(topics)
def get_lines_by_topic(path, topic, tag):
    """Return the run-file lines for `topic`, re-joined with the run tag
    (the last whitespace token) replaced by `tag`."""
    selected = []
    with open(path, 'r') as f:
        for raw in f:
            fields = raw.split()
            if fields[0] == topic:
                fields[-1] = tag
                selected.append(' '.join(fields))
    return selected
def read_qrels(path: str):
    """Parse a TREC qrels file into a list of dicts.

    Each line yields {'topic': first token, 'doc_id': second-to-last token,
    'relevance': last token as int}.
    """
    entries = []
    with open(path, 'r') as f:
        for raw in f:
            fields = raw.strip().split()
            entries.append({
                'topic': fields[0],
                'doc_id': fields[-2],
                'relevance': int(fields[-1]),
            })
    return entries
def get_doc_to_id_from_qrun_by_topic(path: str, topic: str):
    """Map doc_id -> score for one topic of a TREC run file.

    Uses the third token as the doc id and the second-to-last token as the
    score (standard `topic Q0 docid rank score tag` layout).
    """
    scores = {}
    with open(path, 'r') as f:
        for raw in f:
            fields = raw.strip().split()
            if fields[0] == topic:
                scores[fields[2]] = float(fields[-2])
    return scores
def get_docs_from_qrun_by_topic(path: str, topic: str):
    """Return parallel lists (doc_ids, scores) for one topic of a run file,
    preserving the file's line order."""
    doc_ids, scores = [], []
    with open(path, 'r') as f:
        for raw in f:
            fields = raw.strip().split()
            if fields[0] == topic:
                doc_ids.append(fields[2])
                scores.append(float(fields[-2]))
    return doc_ids, scores
def get_X_Y_from_qrels_by_topic(path: str, topic: str, R: List[int]):
    """Build training data for `topic` from a qrels file.

    Docs whose relevance is in `R` get label 1; relevance-0 docs are always
    included as negatives (label 0).

    Fix: the original did ``R.append(0)``, mutating the caller's list — rank()
    calls this once per topic, so `R` grew an extra 0 on every call. The
    relevance filter is now a local set that does not touch `R`.
    """
    wanted = set(R) | {0}  # always include relevance 0 as negatives
    x, y = [], []
    for qrel in read_qrels(path):
        if qrel['topic'] != topic or qrel['relevance'] not in wanted:
            continue
        x.append(qrel['doc_id'])
        y.append(0 if qrel['relevance'] == 0 else 1)
    return x, y
class SpecterVectorizer:
    """Looks up precomputed SPECTER document embeddings.

    Loads data/specter.csv once (rows: doc_id, v1, v2, ...) into a dict and
    serves L2-normalized vectors for requested doc ids.
    """

    def __init__(self):
        # doc_id -> list[float] embedding, loaded eagerly from disk.
        self.vectors = {}
        with open("data/specter.csv", 'r') as f:
            for raw in f:
                fields = raw.strip().split(',')
                self.vectors[fields[0]] = [float(v) for v in fields[1:]]

    def get_vectors(self, doc_ids: List[str]):
        """Return L2-normalized vectors for `doc_ids`; unknown ids are
        reported on stdout and silently dropped (result may be shorter
        than the input)."""
        found = []
        for doc_id in doc_ids:
            vec = self.vectors.get(doc_id)
            if vec is not None:
                found.append(vec)
            else:
                print(f'{doc_id} not found')
        return preprocessing.normalize(found)
class ClassifierType(Enum):
    """Supported downstream classifiers."""
    SVM = 'svm'
    LR = 'lr'
    NB = 'nb'


# Short display name for each classifier (identical to the enum value).
ClassifierStr = {member: member.value for member in ClassifierType}
class VectorizerType(Enum):
    """Supported document vectorizers."""
    TFIDF = 'tfidf'
    BM25 = 'bm25'
    SPECTER = 'specter'


# Short display name for each vectorizer (identical to the enum value).
VectorizerStr = {member: member.value for member in VectorizerType}
def evaluate(qrels_path: str, run_path: str, options: str = ''):
    """Score `run_path` against `qrels_path` with trec_eval.

    Returns (map_score, ndcg_cut_20_score) as strings.

    Fix: the original wrapped the bytes output in str(), producing a repr like
    "b'map \\t all\\t0.1234\\n'", then split on the two literal characters
    backslash-t inside that repr — so the returned "score" carried trailing
    "\\n'" junk into the JSON output. We now decode the bytes and strip.
    """
    curdir = os.getcwd()
    # trec_eval lives relative to the anserini checkout; the relative prefix
    # depends on which directory this script is launched from.
    if curdir.endswith('clprf'):
        anserini_root = '../../../anserini'
    else:
        anserini_root = '../anserini'
    prefix = f"{anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -c -M1000 -m all_trec {qrels_path}"
    cmd1 = f"{prefix} {run_path} {options} | grep 'ndcg_cut_20 '"
    cmd2 = f"{prefix} {run_path} {options} | grep 'map '"
    # NOTE(review): shell=True with interpolated paths is injection-prone;
    # acceptable here only because paths come from local CLI arguments.
    ndcg_score = subprocess.check_output(cmd1, shell=True).decode().split('\t')[-1].strip()
    map_score = subprocess.check_output(cmd2, shell=True).decode().split('\t')[-1].strip()
    return map_score, ndcg_score
def rank(new_qrels: str, base: str, tmp_base: str, qrels_path: str, lucene_index_path: str, R: List[int], score_path: str, alpha: float, clf_type: ClassifierType, vec_type: VectorizerType, tag: str):
    """Rerank a base run with classifier-based relevance feedback (CCRF).

    For each topic: train `clf_type` on vectors of judged docs (relevance in
    `R` plus relevance-0 negatives), score the base run's docs with the
    classifier, min-max normalize both score lists, and interpolate as
    alpha * classifier + (1 - alpha) * base. Topics with no eligible training
    docs are copied through from the base run (retagged). The new run is
    written as a TREC run file, then evaluated with trec_eval, and
    {'map', 'ndcg'} is dumped as JSON to `score_path`.
    """
    # Build the output path; the prefix depends on the working directory.
    base_str = base.split('/')[-1]
    R_str = ''.join(str(i) for i in R)
    curdir = os.getcwd()
    if curdir.endswith('integrations'):
        output_path = f'{tmp_base}/runs/{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}.R{R_str}.A{alpha}.txt'
    else:
        output_path = f'integrations/{tmp_base}/runs/{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}.R{R_str}.A{alpha}.txt'
    print(f'Output -> {output_path}')
    # Fix: os.system('mkdir -p runs') created ./runs, but output_path lives
    # under {tmp_base}/runs — create the directory the file is written to.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    if vec_type == VectorizerType.TFIDF:
        vectorizer = TfidfVectorizer(lucene_index_path, min_df=5)
    elif vec_type == VectorizerType.SPECTER:
        # SPECTER embeddings are precomputed; the run and qrels files have
        # parallel '.specter' variants restricted to docs with embeddings.
        base += '.specter'
        qrels_path += '.specter'
        vectorizer = SpecterVectorizer()
    elif vec_type == VectorizerType.BM25:
        vectorizer = BM25Vectorizer(lucene_index_path, min_df=5)
    else:
        print('invalid vectorizer')
        exit()

    skipped_topics = set()
    topics = get_topics_from_qrun(base)
    # Fix: the output file was opened without a context manager, leaking the
    # handle if any topic raised mid-loop.
    with open(output_path, 'w+') as f:
        for topic in topics:
            train_docs, train_labels = get_X_Y_from_qrels_by_topic(qrels_path, topic, R)
            if len(train_docs) == 0:
                print(f'[topic][{topic}] skipped')
                skipped_topics.add(topic)
                continue
            print(f'[topic][{topic}] eligible train docs {len(train_docs)}')
            if clf_type == ClassifierType.NB:
                clf = MultinomialNB()
            elif clf_type == ClassifierType.LR:
                clf = LogisticRegression()
            elif clf_type == ClassifierType.SVM:
                clf = SVC(kernel='linear', probability=True)
            else:
                print('ClassifierType not supported')
                exit()
            train_vectors = vectorizer.get_vectors(train_docs)
            clf.fit(train_vectors, train_labels)
            test_docs, base_scores = get_docs_from_qrun_by_topic(base, topic)
            print(f'[topic][{topic}] eligible test docs {len(test_docs)}')
            test_vectors = vectorizer.get_vectors(test_docs)
            # Probability of the positive class (column 1 of predict_proba).
            rank_scores = [row[1] for row in clf.predict_proba(test_vectors)]
            rank_scores = normalize(rank_scores)
            base_scores = normalize(base_scores)
            # Linear interpolation between classifier and base-run scores.
            preds = [a * alpha + b * (1 - alpha) for a, b in zip(rank_scores, base_scores)]
            preds, docs = sort_dual_list(preds, test_docs)
            for index, (score, doc_id) in enumerate(zip(preds, docs)):
                rank = index + 1
                f.write(f'{topic} Q0 {doc_id} {rank} {score} {tag}\n')
        # Topics with no training data: copy the base ranking through, retagged.
        for topic in sort_str_topics_list(list(skipped_topics)):
            lines = get_lines_by_topic(base, topic, tag)
            print(f'Copying over skipped topic {topic} with {len(lines)} lines')
            for line in lines:
                f.write(f'{line}\n')

    map_score, ndcg_score = evaluate(new_qrels, output_path)
    with open(score_path, 'w') as outfile:
        json.dump({'map': map_score, 'ndcg': ndcg_score}, outfile)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='use tfidf vectorizer on cord-19 dataset with ccrf technique')
    parser.add_argument('-tag', type=str, default="interpolation",
                        metavar="tag_name", help='tag name for resulting Qrun')
    parser.add_argument('-new_qrels', type=str, default="data/qrels-rnd1+2+3+4.txt",
                        metavar="path_to_new_qrels", help='path to new_qrels file')
    parser.add_argument('-base', type=str, default="data/covidex.t5.final.txt",
                        metavar="path_to_base_run", help='path to base run')
    # Fix: default was "tmp101}" (stray closing brace) and the help text
    # carried a stray leading quote; metavar had embedded spaces.
    parser.add_argument('-tmp_base', type=str, default="tmp101",
                        metavar="tmp_file_folder_name", help='tmp file folder name')
    parser.add_argument('-qrels', type=str, default="data/qrels-rnd1+2.txt",
                        metavar="path_to_qrels", help='path to qrels file')
    parser.add_argument('-index', type=str, default="data/lucene-index-cord19-abstract-2020-05-19",
                        metavar="path_to_lucene_index", help='path to lucene index folder')
    # Fix: metavar was "path_to_base_run", copy-pasted from -base.
    parser.add_argument('-output', type=str, default="data/output.json",
                        metavar="path_to_output", help='the path to map and ndcg scores')
    parser.add_argument('-alpha', type=float, required=True, help='alpha value for interpolation')
    parser.add_argument('-clf', type=ClassifierType, required=True, help='which classifier to use')
    parser.add_argument('-vectorizer', type=VectorizerType, required=True, help='which vectorizer to use')
    args = parser.parse_args()

    # Train on docs judged relevance 1 or 2; relevance 0 is added as
    # negatives inside get_X_Y_from_qrels_by_topic.
    R = [1, 2]
    print('Using base run:', args.base)
    rank(args.new_qrels, args.base, args.tmp_base, args.qrels, args.index, R,
         args.output, args.alpha, args.clf, args.vectorizer, args.tag)