Spaces:

geonmin-kim
/

NetsPresso_QA

Runtime error

File size: 10,433 Bytes

d6585f5

#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import os
import json
import sys
sys.path.append('..')
sys.path.append('../pyserini')
import subprocess

from enum import Enum
from pyserini.vectorizer import TfidfVectorizer
from pyserini.vectorizer import BM25Vectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from typing import List
from sklearn import preprocessing
from typing import List, Set

def normalize(scores):
    """Min-max scale ``scores`` into the range [0, 1].

    Args:
        scores: Sized sequence of numeric scores.

    Returns:
        A new list holding ``(s - min) / (max - min)`` for every score.
        An empty input yields an empty list. When all scores are identical
        there is no spread to scale by, so every entry is ``0.0`` instead of
        the computation failing with ``ZeroDivisionError``.
    """
    if len(scores) == 0:
        return []

    low = min(scores)
    width = max(scores) - low
    if width == 0:
        # Degenerate case: every score is the same value.
        return [0.0] * len(scores)

    return [(s - low) / width for s in scores]


def sort_dual_list(pred, docs):
    """Sort two parallel lists together, highest ``pred`` first.

    The lists are paired element-wise, the pairs are sorted in ascending
    order (ties on ``pred`` are broken by comparing the ``docs`` entries),
    and the result is reversed so the largest prediction comes first.

    Args:
        pred: Scores used as the primary sort key.
        docs: Items paired with ``pred``; extra entries in the longer list
            are ignored because the pairing uses ``zip``.

    Returns:
        A ``(pred, docs)`` tuple of new lists in descending order. Empty
        inputs return two empty lists.
    """
    ordered = sorted(zip(pred, docs))
    ordered.reverse()

    # Building each column separately also works when there are no pairs,
    # where unpacking ``zip(*pairs)`` into two names would fail.
    sorted_pred = [score for score, _ in ordered]
    sorted_docs = [doc for _, doc in ordered]
    return sorted_pred, sorted_docs


def sort_str_topics_list(topics: List[str]) -> List[str]:
    """Return the topic ids ordered numerically, rendered back as strings.

    Every id is parsed with ``int`` before sorting, so ``'10'`` sorts after
    ``'2'`` and the returned strings are the canonical integer form (for
    example ``'03'`` comes back as ``'3'``).
    """
    numeric_ids = list(map(int, topics))
    numeric_ids.sort()
    return list(map(str, numeric_ids))


def get_topics_from_qrun(path: str) -> List[str]:
    """Collect the distinct topic ids that appear in a run file.

    Args:
        path: Run file whose first whitespace-separated column is the topic.

    Returns:
        The unique topic ids as a list, ordered by ``sort_str_topics_list``
        (numerically). Blank lines in the file are ignored.
    """
    topics = set()
    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line: there is no topic column to read.
                continue
            topics.add(tokens[0])
    return sort_str_topics_list(topics)


def get_lines_by_topic(path, topic, tag):
    """Return the run-file lines of one topic with their last column retagged.

    Args:
        path: Run file to read.
        topic: Only lines whose first column equals this value are kept.
        tag: Replacement for the last whitespace-separated column.

    Returns:
        The matching lines in file order, each re-joined with single spaces
        and without a trailing newline. Blank lines are ignored.
    """
    res = []
    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens or tokens[0] != topic:
                continue
            tokens[-1] = tag
            res.append(' '.join(tokens))

    return res


def read_qrels(path: str):
    """Parse a qrels file into a list of judgment records.

    Each non-blank line is split on whitespace: the first token is the
    topic, the second-to-last token is the document id and the last token
    is the integer relevance grade.

    Args:
        path: Qrels file to read.

    Returns:
        A list of dicts with keys ``'topic'``, ``'doc_id'`` and
        ``'relevance'``, in file order. Blank lines are ignored.

    Raises:
        ValueError: If the last token of a line is not an integer.
    """
    qrels = []

    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Tolerate blank lines, e.g. trailing newlines at end of file.
                continue
            qrels.append({
                'topic': tokens[0],
                'doc_id': tokens[-2],
                'relevance': int(tokens[-1]),
            })

    return qrels


def get_doc_to_id_from_qrun_by_topic(path: str, topic: str):
    """Map each document of one topic in a run file to its score.

    Despite the name, the returned dict is keyed by document id (third
    column) and holds the score (second-to-last column) parsed as a float.

    Args:
        path: Run file to read.
        topic: Only lines whose first column equals this value are used.

    Returns:
        A ``{doc_id: score}`` dict. If a document appears more than once for
        the topic, the last occurrence wins. Blank lines are ignored.
    """
    res = {}
    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens or tokens[0] != topic:
                continue
            res[tokens[2]] = float(tokens[-2])

    return res


def get_docs_from_qrun_by_topic(path: str, topic: str):
    """Read the documents and scores listed for one topic in a run file.

    Args:
        path: Run file to read.
        topic: Only lines whose first column equals this value are used.

    Returns:
        A ``(doc_ids, scores)`` tuple of parallel lists in file order, where
        the document id is the third column and the score is the
        second-to-last column parsed as a float. Blank lines are ignored.
    """
    doc_ids, scores = [], []
    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens or tokens[0] != topic:
                continue
            doc_ids.append(tokens[2])
            scores.append(float(tokens[-2]))

    return doc_ids, scores


def get_X_Y_from_qrels_by_topic(path: str, topic: str, R: List[int]):
    """Build binary training data for one topic from a qrels file.

    Args:
        path: Qrels file, parsed with ``read_qrels``.
        topic: Topic whose judgments are selected.
        R: Relevance grades to keep as positive examples. Grade 0 is always
            kept as well. The list is not modified.

    Returns:
        A ``(doc_ids, labels)`` tuple of parallel lists, where the label is
        0 for a relevance grade of 0 and 1 for any other kept grade.
    """
    # Always include relevance grade 0. Use a local set rather than
    # appending to the caller's list, which would grow it on every call.
    accepted = set(R) | {0}

    x, y = [], []
    for qrel in read_qrels(path):
        if qrel['topic'] != topic or qrel['relevance'] not in accepted:
            continue
        x.append(qrel['doc_id'])
        y.append(0 if qrel['relevance'] == 0 else 1)

    return x, y


class SpecterVectorizer:
    """Serve precomputed document vectors loaded from a CSV file.

    Each CSV row has the form ``doc_id,v1,v2,...``; the values after the
    first column are parsed as floats and kept in memory in ``self.vectors``.
    """

    def __init__(self, path: str = "data/specter.csv"):
        """Load every vector found in ``path``.

        Args:
            path: CSV file to read. Defaults to ``data/specter.csv``, the
                location that was previously hard-coded.
        """
        self.vectors = {}

        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank row would otherwise register an empty vector
                    # under the empty-string id.
                    continue
                doc_id, *values = line.split(',')
                self.vectors[doc_id] = [float(item) for item in values]

    def get_vectors(self, doc_ids: List[str]):
        """Return the normalized vectors of the known ids in ``doc_ids``.

        Ids without a stored vector are reported on stdout and left out, so
        the result can have fewer rows than ``doc_ids``.
        NOTE(review): callers that pair the rows with labels or scores by
        position appear to assume no id is missing - confirm.

        Args:
            doc_ids: Document ids to look up, in the desired row order.

        Returns:
            The output of ``sklearn.preprocessing.normalize`` applied to the
            vectors that were found.
        """
        res = []

        for doc_id in doc_ids:
            if doc_id in self.vectors:
                res.append(self.vectors[doc_id])
            else:
                print(f'{doc_id} not found')

        return preprocessing.normalize(res)


class ClassifierType(Enum):
    """Classifiers selectable for re-ranking; each value is its short name."""

    SVM = 'svm'
    LR = 'lr'
    NB = 'nb'


# Short name of every classifier, used when composing output file names.
# Derived from the enum so the two definitions cannot drift apart.
ClassifierStr = {member: member.value for member in ClassifierType}


class VectorizerType(Enum):
    """Document vectorizers selectable for re-ranking; each value is its short name."""

    TFIDF = 'tfidf'
    BM25 = 'bm25'
    SPECTER = 'specter'


# Short name of every vectorizer, used when composing output file names.
# Derived from the enum so the two definitions cannot drift apart.
VectorizerStr = {member: member.value for member in VectorizerType}


def evaluate(qrels_path: str, run_path: str, options: str = ''):
    """Score a run with trec_eval and return its MAP and nDCG@20.

    The trec_eval binary is looked up under an ``anserini`` checkout located
    relative to the current working directory (three levels up when the
    working directory ends with ``clprf``, one level up otherwise). It is
    invoked once with ``-c -M1000 -m all_trec`` and its output is parsed for
    the ``map`` and ``ndcg_cut_20`` rows.

    Args:
        qrels_path: Path to the qrels file.
        run_path: Path to the run file to evaluate.
        options: Extra command-line options for trec_eval, as one string;
            it is tokenized with shell-like quoting rules and appended after
            the run path.

    Returns:
        A ``(map_score, ndcg_score)`` tuple of strings holding the numbers
        exactly as printed by trec_eval.

    Raises:
        subprocess.CalledProcessError: If trec_eval exits with an error.
        ValueError: If either metric is absent from the output.
    """
    import shlex  # local import: only needed to tokenize ``options``

    if os.getcwd().endswith('clprf'):
        anserini_root = '../../../anserini'
    else:
        anserini_root = '../anserini'

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters from being re-interpreted.
    cmd = [
        f'{anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval',
        '-c', '-M1000', '-m', 'all_trec',
        qrels_path, run_path,
        *shlex.split(options),
    ]
    output = subprocess.check_output(cmd).decode('utf-8')

    # Match on the exact metric name in the first column so that rows such
    # as ``gm_map`` or ``ndcg_cut_200`` are not picked up by mistake.
    scores = {'map': None, 'ndcg_cut_20': None}
    for line in output.splitlines():
        fields = line.split()
        if len(fields) >= 2 and fields[0] in scores:
            scores[fields[0]] = fields[-1]

    missing = [name for name, value in scores.items() if value is None]
    if missing:
        raise ValueError(f'trec_eval output has no value for: {", ".join(missing)}')

    return scores['map'], scores['ndcg_cut_20']


def _build_vectorizer(vec_type, lucene_index_path):
    """Create the document vectorizer for ``vec_type``; exit on an unknown type."""
    if vec_type == VectorizerType.TFIDF:
        return TfidfVectorizer(lucene_index_path, min_df=5)
    if vec_type == VectorizerType.SPECTER:
        return SpecterVectorizer()
    if vec_type == VectorizerType.BM25:
        return BM25Vectorizer(lucene_index_path, min_df=5)
    sys.exit('invalid vectorizer')


def _build_classifier(clf_type):
    """Create a fresh, unfitted classifier for ``clf_type``; exit on an unknown type."""
    if clf_type == ClassifierType.NB:
        return MultinomialNB()
    if clf_type == ClassifierType.LR:
        return LogisticRegression()
    if clf_type == ClassifierType.SVM:
        return SVC(kernel='linear', probability=True)
    sys.exit('ClassifierType not supported')


def rank(new_qrels: str, base: str, tmp_base: str, qrels_path: str, lucene_index_path: str,
         R: List[int], score_path: str, alpha: float, clf_type: ClassifierType,
         vec_type: VectorizerType, tag: str):
    """Re-rank a base run with per-topic classifiers and record its scores.

    For every topic of the base run, a classifier is trained on that topic's
    judged documents, the run's documents get a positive-class probability,
    and that probability is interpolated with the base score (both min-max
    normalized) as ``alpha * classifier + (1 - alpha) * base``. Topics with
    no training documents are copied from the base run unchanged (apart from
    the tag) after all re-ranked topics. The resulting run is evaluated
    against ``new_qrels`` and the MAP / nDCG values are written as JSON.

    Args:
        new_qrels: Qrels file used to evaluate the re-ranked run.
        base: Path to the base run file.
        tmp_base: Folder under which ``runs/<name>.txt`` is written; it is
            prefixed with ``integrations/`` unless the working directory
            already ends with ``integrations``.
        qrels_path: Qrels file providing the training judgments.
        lucene_index_path: Lucene index used by the TF-IDF / BM25 vectorizers.
        R: Relevance grades treated as positive training examples.
        score_path: Where the ``{'map': ..., 'ndcg': ...}`` JSON is written.
        alpha: Interpolation weight of the classifier score.
        clf_type: Which classifier to train.
        vec_type: Which vectorizer to use. With SPECTER, ``.specter`` is
            appended to both ``base`` and ``qrels_path``.
        tag: Run tag written in the last column of the output.
    """
    # Build the output path from the base run name and the configuration.
    base_str = base.split('/')[-1]
    R_str = ''.join([str(i) for i in R])
    run_name = f'{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}.R{R_str}.A{alpha}.txt'
    if os.getcwd().endswith('integrations'):
        output_path = f'{tmp_base}/runs/{run_name}'
    else:
        output_path = f'integrations/{tmp_base}/runs/{run_name}'
    print(f'Output -> {output_path}')
    # Create the directory the run is actually written to.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    if vec_type == VectorizerType.SPECTER:
        base += '.specter'
        qrels_path += '.specter'
    vectorizer = _build_vectorizer(vec_type, lucene_index_path)

    skipped_topics = set()
    with open(output_path, 'w') as f:
        for topic in get_topics_from_qrun(base):
            # Pass a copy so the caller's list of grades is never altered.
            train_docs, train_labels = get_X_Y_from_qrels_by_topic(qrels_path, topic, list(R))
            if not train_docs:
                print(f'[topic][{topic}] skipped')
                skipped_topics.add(topic)
                continue

            print(f'[topic][{topic}] eligible train docs {len(train_docs)}')

            clf = _build_classifier(clf_type)
            clf.fit(vectorizer.get_vectors(train_docs), train_labels)

            test_docs, base_scores = get_docs_from_qrun_by_topic(base, topic)
            print(f'[topic][{topic}] eligible test docs {len(test_docs)}')
            test_vectors = vectorizer.get_vectors(test_docs)

            # Column 1 of predict_proba is the probability of label 1.
            rank_scores = [row[1] for row in clf.predict_proba(test_vectors)]

            rank_scores = normalize(rank_scores)
            base_scores = normalize(base_scores)

            preds = [a * alpha + b * (1 - alpha) for a, b in zip(rank_scores, base_scores)]
            preds, docs = sort_dual_list(preds, test_docs)

            for position, (score, doc_id) in enumerate(zip(preds, docs), start=1):
                f.write(f'{topic} Q0 {doc_id} {position} {score} {tag}\n')

        # Topics without training data keep their base ranking.
        for topic in sort_str_topics_list(list(skipped_topics)):
            lines = get_lines_by_topic(base, topic, tag)
            print(f'Copying over skipped topic {topic} with {len(lines)} lines')
            for line in lines:
                f.write(f'{line}\n')

    # The run file is closed (and flushed) before it is evaluated.
    map_score, ndcg_score = evaluate(new_qrels, output_path)
    with open(score_path, 'w') as outfile:
        json.dump({'map': map_score, 'ndcg': ndcg_score}, outfile)


if __name__ == '__main__':
    # Command-line entry point: parse the run configuration and re-rank the
    # base run, treating relevance grades 1 and 2 as positive examples.
    parser = argparse.ArgumentParser(
        description='use tfidf vectorizer on cord-19 dataset with ccrf technique')
    parser.add_argument('-tag', type=str, default="interpolation",
                        metavar="tag_name", help='tag name for resulting Qrun')
    parser.add_argument('-new_qrels', type=str, default="data/qrels-rnd1+2+3+4.txt",
                        metavar="path_to_new_qrels", help='path to new_qrels file')
    parser.add_argument('-base', type=str, default="data/covidex.t5.final.txt",
                        metavar="path_to_base_run", help='path to base run')
    # The default previously carried a stray closing brace ("tmp101}").
    parser.add_argument('-tmp_base', type=str, default="tmp101",
                        metavar="tmp_folder_name", help='tmp file folder name')
    parser.add_argument('-qrels', type=str, default="data/qrels-rnd1+2.txt",
                        metavar="path_to_qrels", help='path to qrels file')
    parser.add_argument('-index', type=str, default="data/lucene-index-cord19-abstract-2020-05-19",
                        metavar="path_to_lucene_index", help='path to lucene index folder')
    parser.add_argument('-output', type=str, default="data/output.json",
                        metavar="path_to_scores", help='the path to map and ndcg scores')
    parser.add_argument('-alpha', type=float, required=True, help='alpha value for interpolation')
    # The enum classes double as argparse converters: the raw string is
    # looked up by value, e.g. 'svm' -> ClassifierType.SVM.
    parser.add_argument('-clf', type=ClassifierType, required=True, help='which classifier to use')
    parser.add_argument('-vectorizer', type=VectorizerType, required=True, help='which vectorizer to use')
    args = parser.parse_args()

    R = [1, 2]
    print('Using base run:', args.base)
    rank(args.new_qrels, args.base, args.tmp_base, args.qrels, args.index, R, args.output, args.alpha, args.clf, args.vectorizer, args.tag)