Spaces:

geonmin-kim
/

NetsPresso_QA

Runtime error

File size: 5,717 Bytes

d6585f5

#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import json
import os
import subprocess

import pandas as pd


def get_file_extension(rm3: bool):
    """Return the run-file suffix that matches the selected ranker.

    Only the literal ``True`` selects the BM25+RM3 suffix; every other value
    (including truthy non-bool values) yields the plain BM25 suffix.
    """
    if rm3 is True:
        return '_bm25+rm3.txt'
    return '_bm25.txt'


def get_file_path(run_file, collection, classifier, alpha: str, rm3: bool):
    """Build the path of the run file for one collection/classifier/alpha.

    The layout is ``<run_file>/<collection>/<collection>_<classifier>_A<alpha>``
    followed by the ranker suffix from ``get_file_extension``.
    """
    stem = '{0}/{1}/{1}_{2}_A{3}'.format(
        run_file, collection, classifier, alpha)
    suffix = get_file_extension(rm3)
    return stem + suffix


def get_res_file_path(run_file, collection, classifier, alpha: str, rm3: bool):
    """Build the path of the evaluation-result file for one alpha.

    Results live under ``<run_file>/scripts/classifier_prf/cv/<collection>/``
    and share the naming scheme of the run files, including the ranker suffix
    from ``get_file_extension``.
    """
    cv_dir = f'{run_file}/scripts/classifier_prf/cv/{collection}'
    name_prefix = f'{collection}_{classifier}_A'
    # alpha is concatenated (not formatted), so it must already be a str.
    stem = cv_dir + '/' + name_prefix + alpha
    return stem + get_file_extension(rm3)


def get_trec_eval_cmd(anserini_root: str):
    """Return the path of the trec_eval 9.0.4 binary under the Anserini root."""
    trec_eval_relpath = 'tools/eval/trec_eval.9.0.4/trec_eval'
    return os.path.join(anserini_root, trec_eval_relpath)


def get_qrels_path(anserini_root: str, collection: str):
    """Return the path of the qrels file for ``collection`` under the Anserini root."""
    qrels_name = 'qrels.{}.txt'.format(collection)
    return '{}/src/main/resources/topics-and-qrels/{}'.format(
        anserini_root, qrels_name)


def read_topics_alpha_map(anserini_root, collection, run_file, classifier, rm3: bool):
    """Evaluate the run file of every alpha with trec_eval and store the output.

    For alpha in 0.0, 0.1, ..., 1.0 the corresponding run file is scored with
    ``trec_eval -q -m map -m P.30`` and the per-topic output is written to the
    matching result file under ``<run_file>/scripts/classifier_prf/cv/<collection>``.

    Args:
        anserini_root: Anserini checkout; provides the trec_eval binary and qrels.
        collection: Collection name used in run-file, result-file and qrels names.
        run_file: Root directory of the run files (results are stored below it).
        classifier: Classifier name used in the file names.
        rm3: Whether the BM25+RM3 run files are evaluated.

    Returns:
        The list of the 11 result-file paths, ordered by increasing alpha. A
        path is included even when its evaluation failed.

    Raises:
        OSError: If the result directory or a result file cannot be created.
    """
    # These values do not depend on alpha, so compute them once.
    cv_folder_path = os.path.join(
        run_file, f"scripts/classifier_prf/cv/{collection}")
    os.makedirs(cv_folder_path, exist_ok=True)
    trec_eval_cmd = get_trec_eval_cmd(anserini_root)
    qrels_path = get_qrels_path(anserini_root, collection)

    res_paths = []
    for num in range(0, 11):
        alpha = str(num / 10)
        file_path = get_file_path(run_file, collection, classifier, alpha, rm3)
        res_filename = get_res_file_path(
            run_file, collection, classifier, alpha, rm3)
        res_paths.append(res_filename)

        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact and cannot be abused for command injection.
        cmd = [trec_eval_cmd, '-q', '-m', 'map', '-m', 'P.30',
               qrels_path, file_path]
        try:
            with open(res_filename, 'w') as res_file:
                returncode = subprocess.run(cmd, stdout=res_file).returncode
        except FileNotFoundError as err:
            # Best effort, like the original shell call: report and carry on.
            print(f'{file_path} could not be evaluated: {err}')
            continue

        if returncode == 0:
            print(file_path + ' run successfully!')
            print('save result in ' + res_filename)
        else:
            print(f'{file_path} failed: trec_eval exited with status {returncode}')

    return res_paths


def load_in_res(res_paths):
    """Combine the per-alpha result files into one per-topic MAP table.

    Each file is read as whitespace-separated ``Type topicid score`` rows.
    The scores of file ``i`` become column ``str(i / 10)`` (``'0.0'`` ...
    ``'1.0'``). Columns are aligned by row position, so all files must list
    the same rows in the same order.

    Args:
        res_paths: Sequence of at least 11 result-file paths, ordered by alpha.

    Returns:
        A DataFrame indexed by topic id (as ``str``) with one float column per
        alpha. Only ``map`` rows are kept, and the last of them is dropped
        (presumably trec_eval's ``all`` summary row — confirm against the
        result files).
    """
    # Raw string: '\s' is an invalid escape sequence in a normal literal.
    # topicid is forced to str here too, matching the reads below and the
    # string lookups performed on the index by the callers.
    df = pd.read_csv(
        res_paths[0], sep=r'\s+', header=None,
        names=['Type', 'topicid', '0.0'],
        dtype={'topicid': str, '0.0': float})
    df = df.set_index('topicid')

    for num in range(1, 11):
        dataset = pd.read_csv(
            res_paths[num], sep=r'\s+', header=None,
            names=['Type', 'topicid', 'score'],
            dtype={'topicid': str, 'score': float})
        # Positional assignment: relies on identical row order across files.
        df[str(num / 10)] = dataset['score'].values

    # Keep the MAP rows and drop the trailing one.
    df = df[df['Type'] == 'map'][:-1]
    df = df.drop(['Type'], axis=1)
    return df


def _group_run_lines_by_topic(run_path):
    """Read a run file once and bucket its lines by their first space-separated field."""
    lines_by_topic = {}
    with open(run_path) as fp:
        for line in fp:
            topic_id = line.partition(" ")[0]
            lines_by_topic.setdefault(topic_id, []).append(line)
    return lines_by_topic


def generate_run_file(folders, df, collection, run_file, classifier, rm3, output_path):
    """Write a cross-validated run file that uses the best alpha per fold.

    For every fold, the alpha with the highest mean score over the topics of
    all *other* folds is selected, and the lines of the fold's own topics are
    copied from the run file of that alpha. The collected lines are sorted by
    (topic id as string, integer fourth field) and written to ``output_path``.
    The selected alphas are printed, one per fold.

    Args:
        folders: List of folds, each a list of topic ids.
        df: Per-topic score table indexed by topic id (str), one column per alpha.
        collection: Collection name used in the run-file names.
        run_file: Root directory of the run files.
        classifier: Classifier name used in the run-file names.
        rm3: Whether the BM25+RM3 run files are used.
        output_path: Destination of the generated run file.
    """
    highest_alpha_lst, write_lst = [], []
    # Several folds often pick the same alpha; parse each run file only once.
    lines_by_run_file = {}

    for folder in folders:
        train_topicids = [str(topic)
                          for f in folders if f != folder
                          for topic in f if str(topic) in df.index]
        # Column label (alpha) with the best mean score on the training topics.
        highest_alpha = df.loc[train_topicids, :].mean(axis=0).idxmax()
        highest_alpha_lst.append(highest_alpha)

        if not folder:
            continue

        alpha_run_file = get_file_path(
            run_file, collection, classifier, highest_alpha, rm3)
        if alpha_run_file not in lines_by_run_file:
            lines_by_run_file[alpha_run_file] = _group_run_lines_by_topic(
                alpha_run_file)
        lines_by_topic = lines_by_run_file[alpha_run_file]

        for topic in folder:
            # Exact match on the topic field; a prefix test would also pick up
            # e.g. the lines of topic 301 when looking for topic 30.
            write_lst.extend(lines_by_topic.get(str(topic), []))

    def sort_key(line):
        fields = line.split(" ")
        return fields[0], int(fields[3])

    write_lst.sort(key=sort_key)

    # Open the output only once the content is ready, so a failure above does
    # not leave a truncated file behind.
    with open(output_path, 'w') as target_file:
        target_file.write("".join(write_lst))

    print(highest_alpha_lst)


def main():
    """Command-line entry point.

    Evaluates the run files of every alpha, loads the per-topic scores, reads
    the fold definition from
    ``<pyserini>/scripts/classifier_prf/folds/<collection>.json`` and writes the
    cross-validated run file to ``--output``.
    """
    parser = argparse.ArgumentParser(
        description='Get Best alpha score for corresponding topics')
    parser.add_argument('--anserini', metavar='path', required=True,
                        help='the path to anserini root')
    # The value is joined with scripts/classifier_prf/folds/<collection>.json
    # below, so it is the pyserini root rather than the json file itself.
    parser.add_argument('--pyserini', metavar='path', required=True,
                        help='the path to pyserini root (contains the folds json files)')
    parser.add_argument('--collection', metavar='collectionsname', required=True,
                        help='one of the collection names: robust04, robust05, core17, core18')
    parser.add_argument('--run_file', metavar='path', required=True,
                        help='the path to run files root')
    parser.add_argument('--output', metavar='path', required=True,
                        help='the path to the output file')
    parser.add_argument('--classifier', metavar='name', required=True,
                        help='one of three classifiers: lr or svm or lr+svm')
    parser.add_argument('-rm3', action='store_true',
                        help='use rm3 ranker')

    args = parser.parse_args()

    res_paths = read_topics_alpha_map(
        args.anserini, args.collection, args.run_file, args.classifier, args.rm3)
    clean_df = load_in_res(res_paths)
    folders_path = os.path.join(
        args.pyserini, f'scripts/classifier_prf/folds/{args.collection}.json')

    with open(folders_path) as f:
        folders = json.load(f)

    generate_run_file(folders, clean_df, args.collection,
                      args.run_file, args.classifier, args.rm3, args.output)

    print("Successfully generated a trained runfile in " + args.output)


if __name__ == '__main__':
    main()