#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import json
import os

import pandas as pd


def get_file_extension(rm3: bool):
    """Return the run-file suffix for BM25 runs with or without RM3 expansion."""
    return '_bm25+rm3.txt' if rm3 else '_bm25.txt'


def get_file_path(run_file, collection, classifier, alpha: str, rm3: bool):
    """Build the path to the per-alpha run file for a collection/classifier pair."""
    res = f'{run_file}/{collection}/{collection}_{classifier}_A{alpha}'
    return res + get_file_extension(rm3)


def get_res_file_path(run_file, collection, classifier, alpha: str, rm3: bool):
    """Build the path where the trec_eval output for a given alpha is stored."""
    res = f'{run_file}/scripts/classifier_prf/cv/{collection}/{collection}_{classifier}_A{alpha}'
    return res + get_file_extension(rm3)


def get_trec_eval_cmd(anserini_root: str):
    return os.path.join(anserini_root, 'tools/eval/trec_eval.9.0.4/trec_eval')


def get_qrels_path(anserini_root: str, collection: str):
    return f'{anserini_root}/src/main/resources/topics-and-qrels/qrels.{collection}.txt'


def read_topics_alpha_map(anserini_root, collection, run_file, classifier, rm3: bool):
    """Run trec_eval on each per-alpha run file and return the paths of the result files."""
    res_paths = []
    for num in range(0, 11):
        alpha = str(num / 10)
        file_path = get_file_path(run_file, collection, classifier, alpha, rm3)
        # Make sure the cross-validation output directory exists.
        cv_folder_path = os.path.join(run_file, f'scripts/classifier_prf/cv/{collection}')
        os.makedirs(cv_folder_path, exist_ok=True)
        res_filename = get_res_file_path(run_file, collection, classifier, alpha, rm3)
        res_paths.append(res_filename)
        trec_eval_cmd = get_trec_eval_cmd(anserini_root)
        qrels_path = get_qrels_path(anserini_root, collection)
        cmd = f'{trec_eval_cmd} -q -m map -m P.30 {qrels_path} {file_path} > {res_filename}'
        res = os.system(cmd)
        if res == 0:
            print(file_path + ' ran successfully!')
            print('saved result in ' + res_filename)
    return res_paths


def load_in_res(res_paths):
    """Load the per-topic MAP scores for every alpha into a DataFrame indexed by topic id."""
    df = pd.read_csv(
        res_paths[0], sep=r'\s+', header=None,
        names=['Type', 'topicid', '0.0'],
        dtype={'topicid': str, '0.0': float})
    df.set_index('topicid', inplace=True)
    for num in range(1, 11):
        dataset = pd.read_csv(
            res_paths[num], sep=r'\s+', header=None,
            names=['Type', 'topicid', 'score'],
            dtype={'topicid': str, 'score': float})
        df[str(num / 10)] = dataset.score.values
    # Keep only the per-topic MAP rows; the final row is trec_eval's 'all' summary.
    df = df[df['Type'] == 'map'][:-1]
    df = df.drop(['Type'], axis=1)
    return df


def generate_run_file(folders, df, collection, run_file, classifier, rm3, output_path):
    """For each fold, pick the alpha with the best mean MAP on the remaining folds and
    copy that alpha's run lines for the held-out topics into the output run file."""
    highest_alpha_lst, write_lst = [], []
    with open(output_path, 'w') as target_file:
        for folder in folders:
            # Training topics are all topics outside the current fold that have scores.
            train_topicids = [str(topic) for f in folders for topic in f
                              if f != folder and str(topic) in df.index]
            train_df = df.loc[train_topicids, :].copy()
            train_df.loc['Mean', :] = train_df.mean(axis=0)
            highest_alpha = train_df.iloc[-1, :].idxmax()
            highest_alpha_lst.append(highest_alpha)
            for topic in folder:
                alpha_run_file = get_file_path(
                    run_file, collection, classifier, highest_alpha, rm3)
                with open(alpha_run_file) as fp:
                    for line in fp:
                        if line.startswith(str(topic)):
                            write_lst.append(line)
        # Sort by topic id, then by rank (fourth field of the TREC run format).
        write_lst.sort(key=lambda x: (x.split(" ")[0], int(x.split(" ")[3])))
        target_file.write("".join(write_lst))
    print(highest_alpha_lst)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Get the best alpha per cross-validation fold and build a trained run file')
    parser.add_argument('--anserini', metavar='path', required=True,
                        help='path to the Anserini root')
    parser.add_argument('--pyserini', metavar='path', required=True,
                        help='path to the Pyserini root containing the folds json file')
    parser.add_argument('--collection', metavar='collection_name', required=True,
                        help='collection name: one of robust04, robust05, core17, core18')
    parser.add_argument('--run_file', metavar='path', required=True,
                        help='path to the run files root')
    parser.add_argument('--output', metavar='path', required=True,
                        help='path to the output file')
    parser.add_argument('--classifier', metavar='name', required=True,
                        help='one of three classifiers: lr, svm, or lr+svm')
    parser.add_argument('-rm3', action='store_true',
                        help='use rm3 ranker')
    args = parser.parse_args()

    # Evaluate every per-alpha run with trec_eval, then load the per-topic MAP scores.
    res_paths = read_topics_alpha_map(
        args.anserini, args.collection, args.run_file, args.classifier, args.rm3)
    clean_df = load_in_res(res_paths)

    # Load the cross-validation folds for the collection.
    folders_path = os.path.join(
        args.pyserini, f'scripts/classifier_prf/folds/{args.collection}.json')
    with open(folders_path) as f:
        folders = json.load(f)

    generate_run_file(folders, clean_df, args.collection, args.run_file,
                      args.classifier, args.rm3, args.output)
    print("Successfully generated a trained runfile in " + args.output)
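
# Example invocation (a sketch only; the script name and all paths below are
# placeholders that depend on your local checkout and run-file layout):
#
#   python cross_validate_alpha.py \
#       --anserini /path/to/anserini \
#       --pyserini /path/to/pyserini \
#       --collection robust04 \
#       --run_file /path/to/run_files \
#       --classifier lr \
#       --output robust04_lr_cv_bm25.txt \
#       -rm3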