def evaluate_pr(system, gold, system_score_cutoff=0, k=5, verbosity=0):
    """Return precision, recall and F1 of a ranked system output against gold.

    The system output is filtered by ``system_score_cutoff`` and ranked by
    descending score. Precision and recall are then computed for the top
    ``i`` recommendations, for ``i`` = 1..``k`` (stopping early once the
    system has fewer than ``i`` recommendations). If the gold standard has
    more than ``k`` positive items, the metrics are finally recomputed at
    ``n = len(gold)``, i.e. with the system recommending as many items as
    the gold standard contains.

    Parameters
    ----------
    system : list of tuples
        System output for a sentence, in the form (position, score).
    gold : list of tuples
        Gold standard for a sentence, in the form (position, score). Only
        entries with a score greater than 0 are treated as gold.
    system_score_cutoff : float
        Threshold of importance score for system output; entries whose
        score is not strictly greater are dropped. Defaults to 0.
    k : int
        Largest number of top recommendations to evaluate on.
    verbosity : int
        If greater than 0, print precision and recall at every evaluated
        cutoff, plus the final F1 score.

    Returns
    -------
    tuple
        ``(precision, recall, f_score)``. These are the values at
        ``n = len(gold)`` when the gold standard has more than ``k`` items,
        otherwise the values at the last evaluated cutoff. All three are
        0.0 when nothing could be evaluated (e.g. empty system output).
    """
    # Rebuild both lists so the caller's inputs are never mutated.
    ranked = sorted(
        (item for item in system if item[1] > system_score_cutoff),
        key=lambda item: item[1],
        reverse=True,  # stable: ties keep their original relative order
    )
    gold = [item for item in gold if item[1] > 0]
    gold_count = len(gold)
    # NOTE(review): assumes positions are hashable (e.g. ints) - confirm.
    gold_positions = {item[0] for item in gold}

    # When gold is larger than k, the returned metrics are taken at
    # n = len(gold); n == 0 means "keep the metrics from the per-k loop".
    n = gold_count if gold_count > k else 0

    # Defaults cover the cases where no cutoff is evaluated at all.
    precision = 0.0
    recall = 0.0

    # Show how precision and recall change at different k.
    # NOTE(review): the original text of this loop body was garbled in the
    # source; it is reconstructed to mirror the top-n computation below,
    # stopping once the system has fewer than i recommendations - confirm.
    for i in range(1, k + 1):
        if len(ranked) < i:
            break
        precision, recall = _precision_recall_at(
            ranked, gold_positions, gold_count, i
        )
        if verbosity > 0:
            print("k=", i, "\nprecision=", precision, "\nrecall=", recall)

    if n:
        precision, recall = _precision_recall_at(
            ranked, gold_positions, gold_count, n
        )
        if verbosity > 0:
            print("k=", n, "\nprecision=", precision, "\nrecall=", recall)

    denominator = precision + recall
    f_score = 2 * precision * recall / denominator if denominator else 0.0
    if verbosity > 0:
        print("f1 score=", f_score)

    return (precision, recall, f_score)


def _precision_recall_at(ranked, gold_positions, gold_count, cutoff):
    """Return (precision, recall) for the first ``cutoff`` items of ``ranked``."""
    top = ranked[:cutoff]
    num_correct = sum(1 for item in top if item[0] in gold_positions)
    # Guard both ratios so empty system output or empty gold yields 0.0
    # instead of raising ZeroDivisionError.
    precision = num_correct / len(top) if top else 0.0
    recall = num_correct / gold_count if gold_count else 0.0
    return precision, recall