# NetsPresso_QA/scripts/trec-covid-ranker.py
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
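"""Classifier-based re-ranking of a TREC-COVID run (CCRF-style interpolation).

For each topic, a classifier (Naive Bayes, logistic regression, or linear SVM)
is trained on judged documents from a qrels file, its probabilities are
interpolated with the normalized base-run scores using weight `alpha`, and the
re-ranked run is written in TREC format and scored with Anserini's trec_eval.
"""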
import argparse
import json
import os
import subprocess
import sys
from enum import Enum
from typing import List

sys.path.append('..')
sys.path.append('../pyserini')

from pyserini.vectorizer import BM25Vectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
def normalize(scores):
    low = min(scores)
    high = max(scores)
    width = high - low
    if width == 0:
        # all scores identical; avoid division by zero
        return [0.0 for _ in scores]
    return [(s - low) / width for s in scores]
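# Example: normalize([2.0, 4.0, 6.0]) -> [0.0, 0.5, 1.0]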
def sort_dual_list(pred, docs):
    # sort both lists by prediction score, highest first
    sorted_pairs = sorted(zip(pred, docs), reverse=True)
    pred, docs = (list(items) for items in zip(*sorted_pairs))
    return pred, docs
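# Example: sort_dual_list([0.2, 0.9], ['d1', 'd2']) -> ([0.9, 0.2], ['d2', 'd1'])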
def sort_str_topics_list(topics: List[str]) -> List[str]:
res = sorted([int(t) for t in topics])
return [str(t) for t in res]
def get_topics_from_qrun(path: str) -> List[str]:
res = set()
with open(path, 'r') as f:
for line in f:
res.add(line.split()[0])
return sort_str_topics_list(res)
def get_lines_by_topic(path, topic, tag):
res = []
with open(path, 'r') as f:
for line in f:
tokens = line.split()
if tokens[0] != topic:
continue
tokens[-1] = tag
new_line = ' '.join(tokens)
res.append(new_line)
return res
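# Qrels lines follow the TREC format <topic> <iteration> <doc_id> <relevance>;
# only the topic, doc id, and relevance fields are used below.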
def read_qrels(path: str):
qrels = []
with open(path, 'r') as f:
for line in f:
line = line.strip()
tokens = line.split()
topic = tokens[0]
doc_id = tokens[-2]
relevance = int(tokens[-1])
qrels.append({
'topic': topic,
'doc_id': doc_id,
'relevance': relevance
})
return qrels
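# Run files follow the TREC format <topic> Q0 <doc_id> <rank> <score> <tag>,
# so tokens[2] is the doc id and tokens[-2] is the retrieval score.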
def get_doc_to_id_from_qrun_by_topic(path: str, topic: str):
res = {}
with open(path, 'r') as f:
for line in f:
tokens = line.strip().split()
t = tokens[0]
if topic != t:
continue
doc_id = tokens[2]
score = float(tokens[-2])
res[doc_id] = score
return res
def get_docs_from_qrun_by_topic(path: str, topic: str):
x, y = [], []
with open(path, 'r') as f:
for line in f:
tokens = line.strip().split()
t = tokens[0]
if topic != t:
continue
doc_id = tokens[2]
score = float(tokens[-2])
x.append(doc_id)
y.append(score)
return x, y
def get_X_Y_from_qrels_by_topic(path: str, topic: str, R: List[int]):
    # treat relevance levels in R as positives and always include level 0 as negatives;
    # use a local set so the caller's R list is not mutated on every call
    levels = set(R) | {0}
    qrels = [qrel for qrel in read_qrels(path) if qrel['topic'] == topic and qrel['relevance'] in levels]
    x, y = [], []
    for pack in qrels:
        x.append(pack['doc_id'])
        label = 0 if pack['relevance'] == 0 else 1
        y.append(label)
    return x, y
class SpecterVectorizer:
def __init__(self):
path = "data/specter.csv"
self.vectors = {}
with open(path, 'r') as f:
for line in f:
tokens = line.strip().split(',')
doc_id = tokens[0]
vector = [float(item) for item in tokens[1:]]
self.vectors[doc_id] = vector
def get_vectors(self, doc_ids: List[str]):
res = []
for doc_id in doc_ids:
if doc_id in self.vectors:
res.append(self.vectors[doc_id])
else:
print(f'{doc_id} not found')
return preprocessing.normalize(res)
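# data/specter.csv is assumed to hold one precomputed SPECTER embedding per line,
# formatted as <doc_id>,<v1>,<v2>,... (inferred from the parsing above).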
class ClassifierType(Enum):
SVM = 'svm'
LR = 'lr'
NB = 'nb'
ClassifierStr = {
ClassifierType.SVM: 'svm',
ClassifierType.LR: 'lr',
ClassifierType.NB: 'nb',
}
class VectorizerType(Enum):
TFIDF = 'tfidf'
BM25 = 'bm25'
SPECTER = 'specter'
VectorizerStr = {
VectorizerType.TFIDF: 'tfidf',
VectorizerType.BM25: 'bm25',
VectorizerType.SPECTER: 'specter',
}
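# evaluate() shells out to Anserini's bundled trec_eval binary, so an `anserini`
# checkout is expected alongside this repository (../anserini or ../../../anserini).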
def evaluate(qrels_path: str, run_path: str, options: str = ''):
    curdir = os.getcwd()
    if curdir.endswith('clprf'):
        anserini_root = '../../../anserini'
    else:
        anserini_root = '../anserini'
    prefix = f"{anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -c -M1000 -m all_trec {qrels_path}"
    cmd1 = f"{prefix} {run_path} {options} | grep 'ndcg_cut_20 '"
    cmd2 = f"{prefix} {run_path} {options} | grep 'map '"
    # trec_eval prints '<metric>\tall\t<score>'; take the last tab-separated field
    ndcg_score = subprocess.check_output(cmd1, shell=True).decode().split('\t')[-1].strip()
    map_score = subprocess.check_output(cmd2, shell=True).decode().split('\t')[-1].strip()
    return map_score, ndcg_score
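# rank() mixes the two signals per document as
#   alpha * classifier_probability + (1 - alpha) * base_run_score,
# with both score lists min-max normalized per topic before interpolation.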
def rank(new_qrels: str, base: str, tmp_base: str, qrels_path: str, lucene_index_path: str, R: List[int],
         score_path: str, alpha: float, clf_type: ClassifierType, vec_type: VectorizerType, tag: str):
# build output path
base_str = base.split('/')[-1]
R_str = ''.join([str(i) for i in R])
curdir = os.getcwd()
if curdir.endswith('integrations'):
output_path = f'{tmp_base}/runs/{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}.R{R_str}.A{alpha}.txt'
else:
output_path = f'integrations/{tmp_base}/runs/{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}.R{R_str}.A{alpha}.txt'
    print(f'Output -> {output_path}')
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
vectorizer = None
if vec_type == VectorizerType.TFIDF:
vectorizer = TfidfVectorizer(lucene_index_path, min_df=5)
elif vec_type == VectorizerType.SPECTER:
base += '.specter'
qrels_path += '.specter'
vectorizer = SpecterVectorizer()
elif vec_type == VectorizerType.BM25:
vectorizer = BM25Vectorizer(lucene_index_path, min_df=5)
else:
print('invalid vectorizer')
exit()
f = open(output_path, 'w+')
skipped_topics = set()
topics = get_topics_from_qrun(base)
for topic in topics:
train_docs, train_labels = get_X_Y_from_qrels_by_topic(qrels_path, topic, R)
if len(train_docs) == 0:
print(f'[topic][{topic}] skipped')
skipped_topics.add(topic)
continue
print(f'[topic][{topic}] eligible train docs {len(train_docs)}')
clf = None
if clf_type == ClassifierType.NB:
clf = MultinomialNB()
elif clf_type == ClassifierType.LR:
clf = LogisticRegression()
elif clf_type == ClassifierType.SVM:
clf = SVC(kernel='linear', probability=True)
else:
print('ClassifierType not supported')
exit()
train_vectors = vectorizer.get_vectors(train_docs)
clf.fit(train_vectors, train_labels)
test_docs, base_scores = get_docs_from_qrun_by_topic(base, topic)
print(f'[topic][{topic}] eligible test docs {len(test_docs)}')
test_vectors = vectorizer.get_vectors(test_docs)
rank_scores = clf.predict_proba(test_vectors)
rank_scores = [row[1] for row in rank_scores]
rank_scores = normalize(rank_scores)
base_scores = normalize(base_scores)
preds = [a * alpha + b * (1-alpha) for a, b in zip(rank_scores, base_scores)]
preds, docs = sort_dual_list(preds, test_docs)
for index, (score, doc_id) in enumerate(zip(preds, docs)):
rank = index + 1
f.write(f'{topic} Q0 {doc_id} {rank} {score} {tag}\n')
for topic in sort_str_topics_list(list(skipped_topics)):
lines = get_lines_by_topic(base, topic, tag)
print(f'Copying over skipped topic {topic} with {len(lines)} lines')
for line in lines:
f.write(f'{line}\n')
f.close()
map_score,ndcg_score = evaluate(new_qrels, output_path)
with open(score_path, 'w') as outfile:
json.dump({'map':map_score,'ndcg':ndcg_score}, outfile)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='re-rank a TREC-COVID base run with a per-topic classifier trained on qrels (CCRF-style interpolation)')
parser.add_argument('-tag', type=str, default="interpolation",
metavar="tag_name", help='tag name for resulting Qrun')
parser.add_argument('-new_qrels', type=str, default="data/qrels-rnd1+2+3+4.txt",
metavar="path_to_new_qrels", help='path to new_qrels file')
parser.add_argument('-base', type=str, default="data/covidex.t5.final.txt",
metavar="path_to_base_run", help='path to base run')
    parser.add_argument('-tmp_base', type=str, default="tmp101",
                        metavar="tmp_folder_name", help='tmp file folder name')
parser.add_argument('-qrels', type=str, default="data/qrels-rnd1+2.txt",
metavar="path_to_qrels", help='path to qrels file')
parser.add_argument('-index', type=str, default="data/lucene-index-cord19-abstract-2020-05-19",
metavar="path_to_lucene_index", help='path to lucene index folder')
    parser.add_argument('-output', type=str, default="data/output.json",
                        metavar="path_to_output", help='path to the output JSON with map and ndcg scores')
parser.add_argument('-alpha', type=float, required=True, help='alpha value for interpolation')
parser.add_argument('-clf', type=ClassifierType, required=True, help='which classifier to use')
parser.add_argument('-vectorizer', type=VectorizerType, required=True, help='which vectorizer to use')
args = parser.parse_args()
R = [1, 2]
print('Using base run:', args.base)
rank(args.new_qrels, args.base, args.tmp_base, args.qrels, args.index, R, args.output, args.alpha, args.clf, args.vectorizer, args.tag)
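
# Example invocation (illustrative alpha; file paths are the argparse defaults above):
#   python trec-covid-ranker.py -alpha 0.6 -clf lr -vectorizer tfidf \
#       -base data/covidex.t5.final.txt -qrels data/qrels-rnd1+2.txt \
#       -new_qrels data/qrels-rnd1+2+3+4.txt \
#       -index data/lucene-index-cord19-abstract-2020-05-19 \
#       -tmp_base tmp101 -output data/output.json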