#Title: TREC_COVID_Round1_OHSU.py
#Author: Jimmy Chen, School of Medicine, OHSU
#Description: Generate 1000 documents per topic in Round 1 TREC_COVID and get trec_eval metrics
# To replicate OHSU_RUN2
# Results: https://ir.nist.gov/covidSubmit/archive/round1/OHSU_RUN2.pdf
#
# In root pyserini directory:
#
# 1. wget https://www.dropbox.com/s/gtq2c3xq81mjowk/lucene-index-covid-full-text-2020-04-10.tar.gz
# 2. tar xvfz lucene-index-covid-full-text-2020-04-10.tar.gz
# 3. python bin/generate_trec_covid_round1_OSHU_RUN2.py
# 4. trec_eval -c -q -M1000 -m all_trec qrels-rnd1.txt Round1_data/full_R1.txt

import sys
sys.path.insert(0, "./")
import pandas as pd
import numpy as np
#import torch
import os
from tqdm.auto import tqdm
import json
from pyserini.search import pysearch
import xml.etree.ElementTree as ET
import requests
import urllib.request
from trectools import misc, TrecRun, TrecQrel, procedures
from pyserini.analysis.pyanalysis import get_lucene_analyzer, Analyzer
import nltk
from nltk.corpus import stopwords

#Round 1 indexes
#Replace with url to folder containing your index
R1_fulltext = 'lucene-index-covid-full-text-2020-04-10'

#Download round 1 topics and parse into dataframe
topics_response = requests.get('https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml')
#Fail loudly on an HTTP error instead of trying to parse an error page as XML
topics_response.raise_for_status()
tree = ET.fromstring(topics_response.text)

topicid = []
query = []
question = []
narrative = []
#Walk topic by topic so the four lists stay aligned even if a topic lacks a field
for topic in tree.iter('topic'):
    topicid.append(topic.attrib['number'])
    query.append(topic.findtext('query', default=''))
    question.append(topic.findtext('question', default=''))
    narrative.append(topic.findtext('narrative', default=''))

#Join to CSV
my_dict = {'Topic': topicid, 'Query': query, 'Question': question, 'Narrative': narrative}
R1_topics = pd.DataFrame(my_dict)
R1_topics = R1_topics[['Topic', 'Query', 'Question', 'Narrative']]

#Output folder for the run file(s)
curr_dir = os.getcwd()
Pyserini_files = os.path.join(curr_dir, 'Round1_data')
os.makedirs(Pyserini_files, exist_ok=True)

#Topics
full_searcher = pysearch.LuceneSearcher(R1_fulltext)

#Configure searcher parameters
#NOTE(review): the LM Dirichlet similarity is set right after BM25, so it
#presumably replaces the BM25 setting - confirm against the searcher API.
#The call order is kept unchanged because this script replicates a submitted run.
full_searcher.set_bm25_similarity(k1=1.5, b=0.4)
full_searcher.set_lm_dirichlet_similarity(mu = 2000)
full_searcher.set_rm3_reranker(fb_terms=10, fb_docs=10, original_query_weight=0.5)

#Stopwords for tokenization - manual review
stopwords_manual = ['seek', 'seeking', 'look', 'looking', 'studies', 'study', 'information',
                    'about', 'range', 'studies', 'its', 'coronaviru',
                    'other', '2', '19', 'well', ' will', 'from', 'have', 'more', 'covid', 'any', 'what',
                    'should', 'may', 'due', 'help', 'non', 's', 'those', 'people', 'ways', 'all', 'gain',
                    'possible', 'toward', 'specifically', 'learned', 'number', 'proportion', 'including',
                    'etc', 'still', 'while', 'human', 'specific', 'result', 'results', 'assess', 'need',
                    'between', 'take', 'taking', 'patient', 'type', 'cause' ,'frequency', 'less', 'face',
                    'likely', 'infect', 'upon', 'develop', 'represent', 'promising', 'step', 'related',
                    'papers', 'describe', 'also', 'relevant', 'who', 'show', 'science', 'basic', 'complete',
                    'do', 'how', 'been', 'against', 'use', 'to', 'had', 'has', 'approach', 'Studies', 'Stud', 'Inst', 'Divi' ,'Thomae',
                    'Brigham', 'Young', 'Univ', 'studies', 'volition', 'severe acute respiratory syndrome', 'affect', 'affected']

#NLTK stopwords
nltk.download('stopwords')
#The corpus file id is lower-case; a separate name keeps the imported
#`stopwords` corpus reader from being overwritten.
nltk_stopwords = stopwords.words('english')
#A set gives constant-time membership tests while filtering tokens
stopword_set = set(stopwords_manual).union(nltk_stopwords)


def _keyword_tokens(analyzer, text, stop_set):
    """Analyze *text* and return its tokens minus stopwords and duplicates.

    Tokens keep the order of their first occurrence, so the query string built
    from them is identical on every run (the ordering of set() varies between
    interpreter runs because of string hash randomisation).
    """
    kept = (tok for tok in analyzer.analyze(text) if tok not in stop_set)
    return list(dict.fromkeys(kept))


#One analyzer is enough; it is reused for every topic
krovetz_analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))

#Extract important narrative text
token_narrative_list = [_keyword_tokens(krovetz_analyzer, text, stopword_set)
                        for text in R1_topics['Narrative']]

#Tokenize question
token_question_list = [_keyword_tokens(krovetz_analyzer, text, stopword_set)
                       for text in R1_topics['Question']]

#Anserini searcher can take both query and keywords
#keywords_list = '2019-nCoV, SARS-CoV-2, COVID-19'
keywords_list = 'COVID-19'

#Extract search results from the searcher
docid_list = []
rank_list = []
score_list = []
topic_id_list = []
title_list = []
doi_list = []

print('Searching topics for documents')

#Search extra - will drop duplicates and excess to 1000
n_papers = 1100
for ii, row in R1_topics.iterrows():
    topic_query = row['Query']
    topic_num = row['Topic']
    token_topic = ', '.join(token_narrative_list[ii])
    token_question = ','.join(token_question_list[ii])
    input_query = topic_query + '. ' + token_question + '. ' + token_topic + ' . ' + keywords_list

    hits = full_searcher.search(q = input_query, k=n_papers)
    print(topic_num)

    #The searcher may return fewer than n_papers hits; never index past the end
    n_hits = min(len(hits), n_papers)
    for i in tqdm(range(n_hits), position = 0, leave = True):
        hit = hits[i]
        topic_id_list.append(topic_num)
        docid_list.append(hit.docid)
        rank_list.append(str(i+1))
        score_list.append(hit.score)
        title_list.append(hit.lucene_document.get("title"))
        doi_list.append('https://doi.org/' + str(hit.lucene_document.get("doi")))


#Make dataframe from lists generated from search
def TREC_df(topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, run_param, max_rank=1000):
    """Build the TREC run table from the parallel per-hit lists.

    Args:
        topic_id_list: topic number of each hit.
        docid_list: document id of each hit.
        rank_list: rank of each hit as produced by the search loop; it is
            replaced by a rank recomputed after duplicate removal.
        score_list: retrieval score of each hit.
        title_list: document title of each hit (dropped from the result).
        doi_list: DOI url of each hit (dropped from the result).
        run_param: run tag written in the last column of every row.
        max_rank: keep only rows ranked at or above this position per topic.

    Returns:
        DataFrame with columns topic, q0, docid, rank, score, qid.
    """
    n_rows = len(topic_id_list)
    df = pd.DataFrame({
        'topic': topic_id_list,
        'q0': ['q0'] * n_rows,
        'docid': docid_list,
        'rank': rank_list,
        'score': score_list,
        'title': title_list,
        'doi': doi_list,
        #Run-tag for TREC run requirements
        'qid': [run_param] * n_rows,
    })
    df = df[['topic', 'q0', 'docid', 'rank', 'score', 'title', 'doi', 'qid']]

    #Remove duplicates
    df = df.drop_duplicates(subset=['topic', 'docid'], keep='first')

    #Re-rank. method='first' gives every row in a topic a distinct integer rank;
    #the default 'average' assigns fractional ranks to tied scores, which the
    #int cast would truncate into duplicated ranks.
    df['rank'] = df.groupby('topic')['score'].rank(ascending=False, method='first').astype(int)
    df = df[df['rank'] <= max_rank]

    #Reset index
    df = df.reset_index(drop=True)

    #Get columns for submission
    succinct_results = df[['topic', 'q0', 'docid', 'rank', 'score', 'qid']]
    return succinct_results


full_df = TREC_df(topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, 'FullTxt_run')
full_df.to_csv(os.path.join(Pyserini_files, 'full_R1.txt'), sep=' ', index=False, header=None)

#Use Trec Eval to evaluate initial runs
#Run TREC_Eval
print('Running trec_eval on search results')
r = requests.get('https://ir.nist.gov/covidSubmit/data/qrels-rnd1.txt')
#Do not write an HTTP error page to disk as if it were the qrels
r.raise_for_status()
qrels_file = os.path.join(os.getcwd(), 'qrels.txt')
with open(qrels_file, 'wb') as f:
    f.write(r.content)

qrels = TrecQrel(qrels_file)

#Generate metrics for every run file found in the output folder
runs = procedures.list_of_runs_from_path(Pyserini_files, "*.txt")
results = procedures.evaluate_runs(runs, qrels, per_query=True)
p5 = procedures.extract_metric_from_results(results, "P_5")
p10 = procedures.extract_metric_from_results(results, "P_10")
Bpref = procedures.extract_metric_from_results(results, "bpref")
Mean_avgP = procedures.extract_metric_from_results(results, 'map')

#Aggregate results to dataframe
runs_names = [os.path.basename(str(x)).split('.')[0] for x in runs]
run_positions = range(len(runs))
#Element 1 of each extracted entry is the metric value
p5_list = [p5[i][1] for i in run_positions]
p10_list = [p10[i][1] for i in run_positions]
map_list = [Mean_avgP[i][1] for i in run_positions]
bpref_list = [Bpref[i][1] for i in run_positions]

Result_df = {'Run': runs_names, 'P@5': p5_list, 'P@10': p10_list, 'MAP': map_list, 'Bpref': bpref_list}
Result_df = pd.DataFrame(Result_df)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(Result_df)