geonmin-kim's picture
Upload folder using huggingface_hub
d6585f5
/*
Copyright (c) 2008 - Chris Buckley.
Permission is granted for use and modification of this file for
research, non-commercial purposes.
*/
#include "common.h"
#include "sysfunc.h"
#include "trec_eval.h"
#include "trec_format.h"
#include "functions.h"
/* Takes the top docs and judged docs for a query, and returns a
rel_rank object giving the ordered relevance values for retrieved
docs, plus relevance occurrence statatistics.
Relevance value is
value in text_qrels if docno is in text_qrels and was judged
(assumed to be a small non-negative integer)
RELVALUE_NONPOOL (-1) if docno is not in text_qrels
RELVALUE_UNJUDGED (-2) if docno is in text_qrels and was not judged.
This procedure may be called repeatedly for a given topic - returned
values are cached until the query changes.
results and rel_info formats must be "trec_results" and "qrels"
respectively.
UNDEF returned if error, 0 if used cache values, 1 if new values.
*/
static int comp_rank_judged (), comp_sim_docno (), comp_docno ();
/* Definitions used for temporary and cached values */
typedef struct {
char *docno;
float sim;
long rank;
long rel;
} DOCNO_INFO;
/* Current cached query */
static char *current_query = "no query";
static long max_current_query = 0;
/* Space reserved for cached returned values */
static long *rel_levels;
static long max_rel_levels = 0;
static RES_RELS saved_res_rels;
static long *ranked_rel_list;
static long max_ranked_rel_list = 0;
/* Space reserved for intermediate values */
static DOCNO_INFO *docno_info;
static long max_docno_info = 0;
int
te_form_res_rels (const EPI *epi, const REL_INFO *rel_info,
const RESULTS *results, RES_RELS *res_rels)
{
long i;
long num_results;
long max_rel;
TEXT_RESULTS_INFO *text_results_info;
TEXT_QRELS_INFO *trec_qrels;
TEXT_QRELS *qrels_ptr, *end_qrels;
if (0 == strcmp (current_query, results->qid)) {
/* Have done this query already. Return cached values */
*res_rels = saved_res_rels;
return (0);
}
/* Check that format type of result info and rel info are correct */
if (strcmp ("qrels", rel_info->rel_format) ||
strcmp ("trec_results", results->ret_format)) {
fprintf (stderr, "trec_eval.form_res_qrels: rel_info format not qrels or results format not trec_results\n");
return (UNDEF);
}
/* Make sure enough space for query and save copy */
i = strlen(results->qid)+1;
if (NULL == (current_query =
te_chk_and_malloc (current_query, &max_current_query,
i, sizeof (char))))
return (UNDEF);
(void) strncpy (current_query, results->qid, i);
text_results_info = (TEXT_RESULTS_INFO *) results->q_results;
trec_qrels = (TEXT_QRELS_INFO *) rel_info->q_rel_info;
num_results = text_results_info->num_text_results;
/* Check and reserve space for output structure */
/* Reserve space for temp structure copying results */
if (NULL == (ranked_rel_list =
te_chk_and_malloc (ranked_rel_list, &max_ranked_rel_list,
num_results, sizeof (long))) ||
NULL == (docno_info =
te_chk_and_malloc (docno_info, &max_docno_info,
num_results, sizeof (DOCNO_INFO))))
return (UNDEF);
for (i = 0; i < num_results; i++) {
docno_info[i].docno = text_results_info->text_results[i].docno;
docno_info[i].sim = text_results_info->text_results[i].sim;
}
/* Sort results by sim, breaking ties lexicographically using docno */
qsort ((char *) docno_info,
(int) num_results,
sizeof (DOCNO_INFO),
comp_sim_docno);
/* Only look at epi->max_num_docs_per_topic (not normally an issue) */
if (num_results > epi->max_num_docs_per_topic)
num_results = epi->max_num_docs_per_topic;
/* Add ranks to docno_info (starting at 1) */
for (i = 0; i < num_results; i++) {
docno_info[i].rank = i+1;
}
/* Sort trec_top lexicographically */
qsort ((char *) docno_info,
(int) num_results,
sizeof (DOCNO_INFO),
comp_docno);
/* Error checking for duplicates */
for (i = 1; i < num_results; i++) {
if (0 == strcmp (docno_info[i].docno,
docno_info[i-1].docno)) {
fprintf (stderr, "trec_eval.form_res_qrels: duplicate docs %s",
docno_info[i].docno);
return (UNDEF);
}
}
/* Find max_rel among qid, reserve and zero space for rel_levels */
/* Check for duplicate docnos. */
qrels_ptr = trec_qrels->text_qrels;
end_qrels = &trec_qrels->text_qrels [trec_qrels->num_text_qrels];
max_rel = qrels_ptr->rel;
qrels_ptr++;
while (qrels_ptr < end_qrels) {
if (max_rel < qrels_ptr->rel)
max_rel = qrels_ptr->rel;
if (0 == strcmp ((qrels_ptr-1)->docno, qrels_ptr->docno)) {
fprintf (stderr, "trec_eval.form_res_rels: duplicate docs %s\n",
qrels_ptr->docno);
return (UNDEF);
}
qrels_ptr++;
}
if (NULL == (rel_levels =
te_chk_and_malloc (rel_levels, &max_rel_levels,
(max_rel+1),
sizeof (long))))
return (UNDEF);
(void) memset (rel_levels, 0, (max_rel+1) * sizeof (long));
/* Construct rank_rel array and rel_levels */
/* Go through docno_info, trec_qrels in parallel to determine relevance
for each doc in docno_info.
Note that trec_qrels already sorted by docno with no duplicates */
qrels_ptr = trec_qrels->text_qrels;
end_qrels = &trec_qrels->text_qrels [trec_qrels->num_text_qrels];
for (i = 0; i < num_results; i++) {
while (qrels_ptr < end_qrels &&
strcmp (qrels_ptr->docno, docno_info[i].docno) < 0) {
if (qrels_ptr->rel >= 0)
rel_levels[qrels_ptr->rel]++;
qrels_ptr++;
}
if (qrels_ptr >= end_qrels ||
strcmp (qrels_ptr->docno, docno_info[i].docno) > 0) {
/* Doc is non-judged */
docno_info[i].rel = RELVALUE_NONPOOL;
}
else {
/* Doc is in pool, assign relevance */
if (qrels_ptr->rel < 0)
/* In pool, but unjudged (eg, infAP uses a sample of pool)*/
docno_info[i].rel = RELVALUE_UNJUDGED;
else {
docno_info[i].rel = qrels_ptr->rel;
}
if (qrels_ptr->rel >= 0)
rel_levels[qrels_ptr->rel]++;
qrels_ptr++;
}
}
/* Finish counting rels */
while (qrels_ptr < end_qrels) {
if (qrels_ptr->rel >= 0)
rel_levels[qrels_ptr->rel]++;
qrels_ptr++;
}
/* Construct ranked_rel_list and associated counts */
saved_res_rels.num_rel_ret = 0;
saved_res_rels.num_nonpool = 0;
saved_res_rels.num_unjudged_in_pool = 0;
saved_res_rels.results_rel_list = ranked_rel_list;
saved_res_rels.rel_levels = rel_levels;
if (epi->judged_docs_only_flag) {
/* If judged_docs_only_flag, then must fix up ranks to
reflect unjudged docs being thrown out. Note: done this way
to preserve original tie-breaking based on text docno */
long rrl;
/* Sort tuples by increasing rank among judged docs*/
qsort ((char *) docno_info,
(int) num_results,
sizeof (DOCNO_INFO),
comp_rank_judged);
rrl = 0; i = 0;
while (i < num_results && docno_info[i].rel >= 0) {
if (docno_info[i].rel >= epi->relevance_level)
saved_res_rels.num_rel_ret++;
saved_res_rels.results_rel_list[rrl++] = docno_info[i++].rel;
}
saved_res_rels.num_ret = rrl;
}
else {
/* Normal path. Assign rel value to appropriate rank */
for (i = 0; i < num_results; i++) {
saved_res_rels.results_rel_list[docno_info[i].rank - 1] =
docno_info[i].rel;
if (RELVALUE_NONPOOL == docno_info[i].rel)
saved_res_rels.num_nonpool++;
else if (RELVALUE_UNJUDGED == docno_info[i].rel)
saved_res_rels.num_unjudged_in_pool++;
else {
if (docno_info[i].rel >= epi->relevance_level)
saved_res_rels.num_rel_ret++;
}
}
saved_res_rels.num_ret = num_results;
}
saved_res_rels.num_rel = 0;
for (i = 0; i <= max_rel; i++) {
if (saved_res_rels.rel_levels[i]) {
saved_res_rels.num_rel_levels = i + 1;
if (i >= epi->relevance_level)
saved_res_rels.num_rel += saved_res_rels.rel_levels[i];
}
}
*res_rels = saved_res_rels;
return (1);
}
static int
comp_rank_judged (ptr1, ptr2)
DOCNO_INFO *ptr1;
DOCNO_INFO *ptr2;
{
if (ptr1->rel >= 0 && ptr2->rel >= 0) {
if (ptr1->rank < ptr2->rank)
return (-1);
if (ptr1->rank > ptr2->rank)
return (1);
return (0);
}
if (ptr1->rel >= 0)
return (-1);
if (ptr2->rel >= 0)
return (1);
return(0);
}
static int
comp_sim_docno (ptr1, ptr2)
DOCNO_INFO *ptr1;
DOCNO_INFO *ptr2;
{
if (ptr1->sim > ptr2->sim)
return (-1);
if (ptr1->sim < ptr2->sim)
return (1);
return (strcmp (ptr2->docno, ptr1->docno));
}
static int
comp_docno (ptr1, ptr2)
DOCNO_INFO *ptr1;
DOCNO_INFO *ptr2;
{
return (strcmp (ptr1->docno, ptr2->docno));
}
int
te_form_res_rels_cleanup ()
{
if (max_current_query > 0) {
Free (current_query);
max_current_query = 0;
current_query = "no_query";
}
if (max_rel_levels > 0) {
Free (rel_levels);
max_rel_levels = 0;
}
if (max_ranked_rel_list > 0) {
Free (ranked_rel_list);
max_ranked_rel_list = 0;
}
if (max_docno_info > 0) {
Free (docno_info);
max_docno_info = 0;
}
return (1);
}