/* geonmin-kim's picture
   Upload folder using huggingface_hub
   d6585f5 */
/*
Copyright (c) 2008 - Chris Buckley.
Permission is granted for use and modification of this file for
research, non-commercial purposes.
*/
#include "common.h"
#include "sysfunc.h"
#include "trec_eval.h"
#include "trec_format.h"
#include "functions.h"
/* Takes the top docs and judged docs for a query, and returns a
rel_rank object giving the ordered relevance values for retrieved
docs, plus relevance occurrence statistics, for every judgment
group in the query.
Relevance value is
value in text_qrels if docno is in text_qrels and was judged
(assumed to be a small non-negative integer)
RELVALUE_NONPOOL (-1) if docno is not in text_qrels
RELVALUE_UNJUDGED (-2) if docno is in text_qrels and was not judged.
This procedure may be called repeatedly for a given topic - returned
values are cached until the query changes.
results and rel_info formats must be "trec_results" and "qrels_jg"
respectively.
UNDEF returned if error, 0 if used cache values, 1 if new values.
*/
/* Forward declarations of the three qsort() comparison functions defined
   further down in this file.  They are declared old-style, with an empty
   parameter list, so the compiler does not check the argument types here. */
static int comp_rank_judged (), comp_sim_docno (), comp_docno ();
/* Definitions used for temporary and cached values */
/* One retrieved document of the query currently being processed.  An array
   of these is filled from the results, sorted by similarity to assign ranks,
   then sorted by docno so it can be merged against the qrels. */
typedef struct {
char *docno;   /* document id; the pointer is copied from the results, not the string */
float sim;     /* similarity score copied from the results */
long rank;     /* 1-based rank after sorting by sim (ties broken by docno) */
long rel;      /* relevance for the judgment group being processed: the qrels
                  value, RELVALUE_NONPOOL or RELVALUE_UNJUDGED */
} DOCNO_INFO;
/* Current cached query */
/* Query id whose results are currently held in the buffers below.  Starts
   out pointing at a string literal; replaced by an allocated buffer on the
   first query processed. */
static char *current_query = "no query";
/* Size bookkeeping for current_query, maintained through te_chk_and_malloc
   (presumably the allocated capacity - confirm against that helper).
   A value > 0 is what the cleanup routine uses to decide whether to free. */
static long max_current_query = 0;
/* Space reserved for cached returned values */
/* Per-judgment-group counters of qrels docs at each relevance level:
   (max_rel+1) entries per judgment group, laid out group after group. */
static long *rel_levels;
static long max_rel_levels = 0;
/* One RES_RELS per judgment group of the cached query, and how many are in use. */
static RES_RELS *jgs;
static long num_jgs;
static long max_num_jgs = 0;
/* Backing store for every judgment group's results_rel_list:
   num_results entries per judgment group. */
static long *ranked_rel_list;
static long max_ranked_rel_list = 0;
/* Scratch copy of the retrieved docs (docno, sim, rank, rel). */
static DOCNO_INFO *docno_info;
static long max_docno_info = 0;
/* Build, or fetch from the cache, the per-judgment-group relevance
   information for the query described by results / rel_info.

   epi       supplies max_num_docs_per_topic, judged_docs_only_flag and
             relevance_level.
   rel_info  must have rel_format "qrels_jg".
   results   must have ret_format "trec_results".
   res_rels  output: qid, num_jgs and jgs are filled in.  jgs points at
             file-static storage that is overwritten by the next call for a
             different query.

   Returns UNDEF on error, 0 if the cached values for this qid were reused,
   1 if new values were computed.

   The cache key (current_query) is only set to the new qid once everything
   has succeeded, so a failed call can never be mistaken for a cached one. */
int
te_form_res_rels_jg (const EPI *epi, const REL_INFO *rel_info,
                     const RESULTS *results, RES_RELS_JG *res_rels)
{
    long i;
    long qid_size;
    char *query_buf;
    long num_results;
    long jg;
    TEXT_RESULTS_INFO *text_results_info;
    TEXT_QRELS_JG_INFO *trec_qrels;
    TEXT_QRELS *qrels_ptr, *end_qrels;
    long max_rel;
    long *rel_level_ptr;

    if (0 == strcmp (current_query, results->qid)) {
        /* Have done this query already. Return cached values */
        res_rels->qid = results->qid;
        res_rels->num_jgs = num_jgs;
        res_rels->jgs = jgs;
        return (0);
    }

    /* Check that format type of result info and rel info are correct */
    if (strcmp ("qrels_jg", rel_info->rel_format) ||
        strcmp ("trec_results", results->ret_format)) {
        fprintf (stderr, "trec_eval: rel_info format not qrels_jg or results format not trec_results\n");
        return (UNDEF);
    }

    /* Make sure there is enough space to hold the new query id */
    qid_size = strlen(results->qid)+1;
    query_buf = te_chk_and_malloc (current_query, &max_current_query,
                                   qid_size, sizeof (char));
    if (NULL == query_buf) {
        /* Never leave current_query NULL: the next call strcmp()s it.
           Fall back to the initial sentinel and mark it as not allocated.
           NOTE(review): assumes te_chk_and_malloc does not keep the old
           buffer alive on failure - confirm against that helper. */
        current_query = "no query";
        max_current_query = 0;
        return (UNDEF);
    }
    current_query = query_buf;
    /* The cached buffers are about to be overwritten, so the old key must
       stop matching right now.  The new qid is stored only on success. */
    current_query[0] = '\0';

    text_results_info = (TEXT_RESULTS_INFO *) results->q_results;
    trec_qrels = (TEXT_QRELS_JG_INFO *) rel_info->q_rel_info;

    num_results = text_results_info->num_text_results;
    num_jgs = trec_qrels->num_text_qrels_jg;

    /* Check and reserve space for output structure */
    /* Need an output array of size num_results for each of the jg in queries */
    /* Reserve space for temp structure copying results */
    if (NULL == (ranked_rel_list =
                 te_chk_and_malloc (ranked_rel_list, &max_ranked_rel_list,
                                    num_results * num_jgs, sizeof (long))) ||
        NULL == (jgs = te_chk_and_malloc (jgs, &max_num_jgs,
                                          num_jgs, sizeof (RES_RELS))) ||
        NULL == (docno_info =
                 te_chk_and_malloc (docno_info, &max_docno_info,
                                    num_results, sizeof (DOCNO_INFO))))
        return (UNDEF);

    for (i = 0; i < num_results; i++) {
        docno_info[i].docno = text_results_info->text_results[i].docno;
        docno_info[i].sim = text_results_info->text_results[i].sim;
    }

    /* Sort results by sim, breaking ties lexicographically using docno */
    qsort (docno_info,
           (size_t) num_results,
           sizeof (DOCNO_INFO),
           comp_sim_docno);

    /* Only look at epi->max_num_docs_per_topic (not normally an issue) */
    if (num_results > epi->max_num_docs_per_topic)
        num_results = epi->max_num_docs_per_topic;

    /* Add ranks to docno_info (starting at 1) */
    for (i = 0; i < num_results; i++) {
        docno_info[i].rank = i+1;
    }

    /* Sort trec_top lexicographically */
    qsort (docno_info,
           (size_t) num_results,
           sizeof (DOCNO_INFO),
           comp_docno);

    /* Error checking for duplicates (adjacent after the docno sort) */
    for (i = 1; i < num_results; i++) {
        if (0 == strcmp (docno_info[i].docno,
                         docno_info[i-1].docno)) {
            fprintf (stderr, "trec_eval.form_res_qrels: duplicate docs %s\n",
                     docno_info[i].docno);
            return (UNDEF);
        }
    }

    /* Find max_rel among all JGs, reserve and zero space for rel_levels */
    max_rel = 0;
    for (jg = 0; jg < trec_qrels->num_text_qrels_jg; jg++) {
        qrels_ptr = trec_qrels->text_qrels_jg[jg].text_qrels;
        end_qrels = &trec_qrels->text_qrels_jg[jg].text_qrels
            [trec_qrels->text_qrels_jg[jg].num_text_qrels];
        while (qrels_ptr < end_qrels) {
            if (max_rel < qrels_ptr->rel)
                max_rel = qrels_ptr->rel;
            qrels_ptr++;
        }
    }
    if (NULL == (rel_levels =
                 te_chk_and_malloc (rel_levels, &max_rel_levels,
                                    (max_rel+1) * num_jgs,
                                    sizeof (long))))
        return (UNDEF);
    (void) memset (rel_levels, 0, (max_rel+1) * num_jgs * sizeof (long));

    /* Construct rank_rel array and rel_levels separately for each JG */
    for (jg = 0; jg < trec_qrels->num_text_qrels_jg; jg++) {
        /* Go through docno_info, trec_qrels in parallel to determine relevance
           for each doc in docno_info.
           Note that trec_qrels already sorted by docno with no duplicates */
        rel_level_ptr = &rel_levels[(max_rel+1) * jg];
        qrels_ptr = trec_qrels->text_qrels_jg[jg].text_qrels;
        end_qrels = &trec_qrels->text_qrels_jg[jg].text_qrels
            [trec_qrels->text_qrels_jg[jg].num_text_qrels];
        for (i = 0; i < num_results; i++) {
            /* Skip (but count) qrels docs that sort before this result */
            while (qrels_ptr < end_qrels &&
                   strcmp (qrels_ptr->docno, docno_info[i].docno) < 0) {
                if (qrels_ptr->rel >= 0)
                    rel_level_ptr[qrels_ptr->rel]++;
                qrels_ptr++;
            }
            if (qrels_ptr >= end_qrels ||
                strcmp (qrels_ptr->docno, docno_info[i].docno) > 0) {
                /* Doc is non-judged */
                docno_info[i].rel = RELVALUE_NONPOOL;
            }
            else {
                /* Doc is in pool, assign relevance */
                if (qrels_ptr->rel < 0)
                    /* In pool, but unjudged (eg, infAP uses a sample of pool)*/
                    docno_info[i].rel = RELVALUE_UNJUDGED;
                else {
                    docno_info[i].rel = qrels_ptr->rel;
                }
                if (qrels_ptr->rel >= 0)
                    rel_level_ptr[qrels_ptr->rel]++;
                qrels_ptr++;
            }
        }
        /* Finish counting rels */
        while (qrels_ptr < end_qrels) {
            if (qrels_ptr->rel >= 0)
                rel_level_ptr[qrels_ptr->rel]++;
            qrels_ptr++;
        }

        /* Construct ranked_rel_list and associated counts */
        jgs[jg].num_rel_ret = 0;
        jgs[jg].num_nonpool = 0;
        jgs[jg].num_unjudged_in_pool = 0;
        jgs[jg].results_rel_list = &ranked_rel_list[jg * num_results];
        jgs[jg].rel_levels = &rel_levels[jg * (max_rel + 1)];

        if (epi->judged_docs_only_flag) {
            /* If judged_docs_only_flag, then must fix up ranks to
               reflect unjudged docs being thrown out. Note: done this way
               to preserve original tie-breaking based on text docno */
            long rrl;
            /* Sort tuples by increasing rank among judged docs*/
            qsort (docno_info,
                   (size_t) num_results,
                   sizeof (DOCNO_INFO),
                   comp_rank_judged);
            rrl = 0; i = 0;
            while (i < num_results && docno_info[i].rel >= 0) {
                if (docno_info[i].rel >= epi->relevance_level)
                    jgs[jg].num_rel_ret++;
                jgs[jg].results_rel_list[rrl++] = docno_info[i++].rel;
            }
            jgs[jg].num_ret = rrl;
            /* resort by docno for next jg */
            if (jg != num_jgs-1)
                qsort (docno_info,
                       (size_t) num_results,
                       sizeof (DOCNO_INFO),
                       comp_docno);
        }
        else {
            /* Normal path. Assign rel value to appropriate rank */
            for (i = 0; i < num_results; i++) {
                jgs[jg].results_rel_list[docno_info[i].rank - 1] =
                    docno_info[i].rel;
                if (RELVALUE_NONPOOL == docno_info[i].rel)
                    jgs[jg].num_nonpool++;
                else if (RELVALUE_UNJUDGED == docno_info[i].rel)
                    jgs[jg].num_unjudged_in_pool++;
                else {
                    if (docno_info[i].rel >= epi->relevance_level)
                        jgs[jg].num_rel_ret++;
                }
            }
            jgs[jg].num_ret = num_results;
        }

        jgs[jg].num_rel = 0;
        /* jgs[] storage is reused between queries, so this must be reset:
           the loop below only assigns it when some level has a non-zero count */
        jgs[jg].num_rel_levels = 0;
        for (i = 0; i <= max_rel; i++) {
            if (jgs[jg].rel_levels[i]) {
                jgs[jg].num_rel_levels = i + 1;
                if (i >= epi->relevance_level)
                    jgs[jg].num_rel += jgs[jg].rel_levels[i];
            }
        }
    }

    /* Everything succeeded: the cached buffers now describe this query */
    (void) strncpy (current_query, results->qid, qid_size);

    res_rels->qid = results->qid;
    res_rels->num_jgs = num_jgs;
    res_rels->jgs = jgs;

    return (1);
}
/* qsort() comparator used when only judged docs are kept.
   Docs with rel >= 0 sort before docs with rel < 0.  Two docs with rel >= 0
   are ordered by increasing rank; two docs with rel < 0 compare equal.
   Takes const void * so the function has exactly the type qsort() calls. */
static int
comp_rank_judged (const void *ptr1, const void *ptr2)
{
    const DOCNO_INFO *doc1 = ptr1;
    const DOCNO_INFO *doc2 = ptr2;

    if (doc1->rel >= 0 && doc2->rel >= 0) {
        if (doc1->rank < doc2->rank)
            return (-1);
        if (doc1->rank > doc2->rank)
            return (1);
        return (0);
    }
    if (doc1->rel >= 0)
        return (-1);
    if (doc2->rel >= 0)
        return (1);
    return (0);
}
/* qsort() comparator that produces the ranking order: decreasing sim.
   Docs with equal sim are ordered by decreasing docno (the strcmp()
   arguments are deliberately swapped).
   Takes const void * so the function has exactly the type qsort() calls. */
static int
comp_sim_docno (const void *ptr1, const void *ptr2)
{
    const DOCNO_INFO *doc1 = ptr1;
    const DOCNO_INFO *doc2 = ptr2;

    if (doc1->sim > doc2->sim)
        return (-1);
    if (doc1->sim < doc2->sim)
        return (1);
    return (strcmp (doc2->docno, doc1->docno));
}
/* qsort() comparator ordering docs by increasing docno (strcmp() order).
   Takes const void * so the function has exactly the type qsort() calls. */
static int
comp_docno (const void *ptr1, const void *ptr2)
{
    const DOCNO_INFO *doc1 = ptr1;
    const DOCNO_INFO *doc2 = ptr2;

    return (strcmp (doc1->docno, doc2->docno));
}
/* Release every buffer cached by te_form_res_rels_jg and reset the
   bookkeeping so that the next call starts from the initial state.
   A buffer is freed only if its max_* counter is > 0.
   Always returns 1. */
int
te_form_res_rels_jg_cleanup ()
{
    if (max_current_query > 0) {
        Free (current_query);
        max_current_query = 0;
        /* Restore the same sentinel the variable is initialised with.
           It contains a space, so presumably it cannot equal a real query
           id, whereas "no_query" is a plausible one. */
        current_query = "no query";
    }
    if (max_rel_levels > 0) {
        Free (rel_levels);
        max_rel_levels = 0;
    }
    if (max_num_jgs > 0) {
        Free (jgs);
        max_num_jgs = 0;
        num_jgs = 0;
    }
    if (max_ranked_rel_list > 0) {
        Free (ranked_rel_list);
        max_ranked_rel_list = 0;
    }
    if (max_docno_info > 0) {
        Free (docno_info);
        max_docno_info = 0;
    }
    return (1);
}