Spaces:

geonmin-kim
/

NetsPresso_QA

Runtime error

App Files Files Community

NetsPresso_QA / tools /eval /gdeval.pl

geonmin-kim

Upload folder using huggingface_hub

d6585f5 11 months ago

raw

history blame contribute delete

No virus

10.4 kB

	#!/usr/bin/perl -w

	# Graded relevance assessment script for the TREC 2010 Web track
	# Evaluation measures are written to standard output in CSV format.
	#
	# Currently reports only NDCG and ERR
	# (see http://learningtorankchallenge.yahoo.com/instructions.php)

	# Natural log of 2. dcg() divides log($i + 2) by this constant to obtain a
	# base-2 logarithm for the rank discount.
	use constant LOGBASEDIV => log(2.0);

	# globals
	# Path of the relevance-judgments file; assigned from the command line in
	# the main block.
	my $QRELS;
	my $VERSION = "version 1.3 (Mon Apr 29 20:50:24 EDT 2013)";
	my $MAX_JUDGMENT = 4; # Maximum gain value allowed in qrels file.
	my $K = 20; # Reporting depth for results.
	# Usage text printed for -help and on any command-line error. The default
	# depth is interpolated from $K when this string is built, i.e. before any
	# -k option has been parsed. Each "\n" escape is followed by a literal line
	# break, so those points print as blank lines.
	my $USAGE = "usage: $0 [options] qrels run\n
	options:\n
	-c
	Average over the complete set of topics in the relevance judgments
	instead of the topics in the intersection of relevance judgments
	and results.\n
	-k value
	Non-negative integer depth of ranking to evaluate in range [1,inf].
	Default value is k=@{[($K)]}.\n
	-baseline BASELINE_RUN_FILE
	Baseline run to use for risk-sensitive evaluation\n
	-riskAlpha value
	Non-negative Risk sensitivity value to use when doing risk-sensitive
	evaluation. A baseline must still be specified. By default 0.
	The final weight to downside changes in performance is (1+value).\n";



	use strict 'vars';

	{ # main block to scope variables
	    # Entry point: parse the command line, load and validate the qrels,
	    # compute the ideal DCG for every judged topic, then evaluate the
	    # optional baseline run followed by the main run.
	    if ($#ARGV >= 0 && ($ARGV[0] eq "-v" || $ARGV[0] eq "-version")) {
	        print "$0: $VERSION\n";
	        exit 0;
	    }

	    my $baselineRun = undef;
	    my $riskAlpha = 0;
	    my $cflag = 0;
	    # Consume options from the front of @ARGV until exactly two arguments
	    # (qrels and run) remain. Each option is only accepted while enough
	    # arguments are left for the option itself plus the two file names.
	    while ($#ARGV != 1) # should probably replace this with perl's argument parsing
	    {
	        if ($#ARGV >= 0 && $ARGV[0] eq "-help") {
	            print "$USAGE\n";
	            exit 0;
	        }
	        elsif ($#ARGV >= 2 and ("-c" eq $ARGV[0]))
	        {
	            $cflag = 1;
	            shift @ARGV;
	        }
	        elsif ($#ARGV >= 3 and ("-k" eq $ARGV[0]))
	        {
	            $K = int($ARGV[1]);
	            die $USAGE if ($K < 1);
	            shift @ARGV; shift @ARGV;
	        }
	        elsif ($#ARGV >= 3 and ("-baseline" eq $ARGV[0]))
	        {
	            $baselineRun = $ARGV[1];
	            shift @ARGV; shift @ARGV;
	        }
	        elsif ($#ARGV >= 3 and ("-riskAlpha" eq $ARGV[0]))
	        {
	            $riskAlpha = $ARGV[1];
	            die $USAGE if ($riskAlpha < 0.0);
	            shift @ARGV; shift @ARGV;
	        }
	        else
	        {
	            die $USAGE;
	        }
	    }

	    die $USAGE unless $#ARGV == 1;
	    $QRELS = $ARGV[0];
	    my $run = $ARGV[1];

	    # Read qrels file, check format, and sort.
	    # Only positive judgments are kept; a topic counts as "seen" only if
	    # it has at least one of them.
	    my @qrels = ();
	    my %seen = ();
	    # Three-argument open so the file name is never parsed as a mode.
	    open (my $qrelsFh, '<', $QRELS) or die "$0: cannot open \"$QRELS\": $!\n";
	    while (<$qrelsFh>) {
	        s/[\r\n]//g;
	        my ($topic, $zero, $docno, $judgment) = split (' ');
	        # Strip any prefix up to the last '-' so only the topic number remains.
	        $topic =~ s/^.*\-//;
	        die "$0: format error on line $. of \"$QRELS\"\n"
	            unless
	                $topic =~ /^[0-9]+$/ && $zero == 0
	                && $judgment =~ /^-?[0-9]+$/ && $judgment <= $MAX_JUDGMENT;
	        if ($judgment > 0) {
	            push @qrels, "$topic $docno $judgment";
	            $seen{$topic} = 1;
	        }
	    }
	    close ($qrelsFh);
	    @qrels = sort qrelsOrder (@qrels);

	    # Process qrels: store judgments and compute ideal gains.
	    # @qrels is sorted by topic and then by descending judgment, so the
	    # gains collected for a topic are already in ideal order.
	    my $topicCurrent = -1;
	    my %ideal = ();
	    my @gain = ();
	    my %judgment = ();
	    foreach my $entry (@qrels) {
	        my ($topic, $docno, $judgment) = split (' ', $entry);
	        if ($topic != $topicCurrent) {
	            if ($topicCurrent >= 0) {
	                $ideal{$topicCurrent} = &dcg($K, @gain);
	                @gain = ();
	            }
	            $topicCurrent = $topic;
	        }
	        next if $judgment < 0;
	        $judgment{"$topic:$docno"} = $judgment;
	        push @gain, $judgment;
	    }
	    # Flush the last topic.
	    if ($topicCurrent >= 0) {
	        $ideal{$topicCurrent} = &dcg($K, @gain);
	        @gain = ();
	    }

	    # process baseline if doing risk sensitive
	    my ($baseNDCGByTopic,$baseERRByTopic,$baserunname);
	    if (defined $baselineRun)
	    {
	        ($baseNDCGByTopic,$baseERRByTopic,$baserunname) = processRun($baselineRun,0,\%seen,\%ideal,\%judgment,$cflag,0);
	    }

	    # process main run
	    processRun($run,1,\%seen,\%ideal,\%judgment,$cflag,defined($baselineRun),$riskAlpha,$baserunname,$baseNDCGByTopic,$baseERRByTopic);

	    exit 0;

	} # end main block

	# Sort comparator for qrels entries of the form "topic docno judgment":
	# ascending numeric topic first, then descending judgment within a topic.
	# Reads the two entries from sort's $a and $b.
	sub qrelsOrder {
	    my ($leftTopic, undef, $leftGrade) = split (' ', $a);
	    my ($rightTopic, undef, $rightGrade) = split (' ', $b);

	    # Fall through to the judgment comparison only when the topics tie.
	    return ($leftTopic <=> $rightTopic) || ($rightGrade <=> $leftGrade);
	}

	# Sort comparator for run entries of the form "topic docno score":
	# ascending numeric topic, then descending score, then descending docno
	# (string comparison) to break score ties. Reads sort's $a and $b.
	sub runOrder {
	    my ($leftTopic, $leftDoc, $leftScore) = split (' ', $a);
	    my ($rightTopic, $rightDoc, $rightScore) = split (' ', $b);

	    # Each stage is consulted only when every earlier stage compared equal.
	    return ($leftTopic <=> $rightTopic)
	        || ($rightScore <=> $leftScore)
	        || ($rightDoc cmp $leftDoc);
	}

	# Discounted cumulative gain of a ranked list of gain values, truncated
	# at depth $k. The gain at zero-based rank i contributes
	# (2**gain - 1) / log2(i + 2).
	#   $k    - evaluation depth
	#   @gain - gain values in rank order
	# Returns the DCG score (0 for an empty list).
	sub dcg {
	    my ($k, @gain) = @_;

	    # Last index to visit: depth $k or the end of the list, whichever
	    # comes first.
	    my $last = $k <= $#gain ? $k - 1 : $#gain;

	    my $total = 0;
	    foreach my $rank (0 .. $last) {
	        $total += (2**$gain[$rank] - 1) / (log($rank + 2) / LOGBASEDIV);
	    }
	    return $total;
	}

	# Expected reciprocal rank of a ranked list of gain values, truncated at
	# depth $k. Each gain g is mapped to a satisfaction probability
	# (2**g - 1) / 2**$MAX_JUDGMENT; the score sums, over ranks, the
	# probability that the user stops at that rank divided by the rank.
	#   $k    - evaluation depth
	#   @gain - gain values in rank order
	# Returns the ERR score (0 for an empty list).
	sub err {
	    my ($k, @gain) = @_;
	    my ($i, $score, $decay, $r);

	    $score = 0.0;
	    $decay = 1.0; # probability that no earlier rank satisfied the user
	    for ($i = 0; $i <= ($k <= $#gain ? $k - 1 : $#gain); $i++) {
	        # Exponentiation needs the ** operator; "2$gain[$i]" does not parse.
	        $r = (2**$gain[$i] - 1)/(2**$MAX_JUDGMENT);
	        $score += $r*$decay/($i + 1);
	        $decay *= (1 - $r);
	    }
	    return $score;
	}

	# Risk-sensitive difference between a run's score and a baseline score.
	# Gains over the baseline are returned unchanged; losses are multiplied
	# by (1 + $alpha).
	#   $run   - score of the run being evaluated
	#   $base  - score of the baseline run
	#   $alpha - risk sensitivity
	sub riskWeighted
	{
	    my ($run,$base,$alpha) = @_;
	    my $delta = $run - $base;
	    return $run < $base ? (1+$alpha) * $delta : $delta;
	}
	# Compute and report the measures for one topic.
	#   $printTopic      - true to print a CSV line for the topic
	#   $runid           - run identifier printed in the CSV line
	#   $topic           - topic number
	#   $pndcgTotal, $perrTotal, $ptopics
	#                    - references to running totals, updated in place
	#   $pseen           - hash ref: topics that have positive judgments
	#   $pideal          - hash ref: ideal DCG per topic
	#   $isRiskSensitive - true to report risk-weighted differences against
	#                      the baseline values instead of raw scores
	#   $riskAlpha       - risk sensitivity passed to riskWeighted
	#   $baseNDCG, $baseERR
	#                    - baseline scores for this topic (undef is treated as 0)
	#   @gain            - gains of the run's documents in rank order
	# Returns ($ndcg, $err), or an empty list when the topic has no positive
	# judgments (such topics are skipped and change no totals).
	sub topicDone {
	    my ($printTopic, $runid, $topic, $pndcgTotal, $perrTotal, $ptopics, $pseen, $pideal,
	        $isRiskSensitive, $riskAlpha, $baseNDCG, $baseERR, @gain) = @_;

	    # Explicit empty return instead of leaking the condition's value.
	    return unless (exists($$pseen{$topic}) and $$pseen{$topic});

	    my $ndcg = &dcg($K, @gain)/$$pideal{$topic};
	    my $err = &err ($K, @gain);
	    if ($isRiskSensitive) {
	        # A missing baseline score counts as 0; this avoids "uninitialized
	        # value" warnings under -w without changing the numeric result.
	        $baseNDCG = 0 unless defined($baseNDCG);
	        $baseERR = 0 unless defined($baseERR);
	        $ndcg = riskWeighted($ndcg,$baseNDCG,$riskAlpha);
	        $err = riskWeighted($err,$baseERR,$riskAlpha);
	    }
	    $$pndcgTotal += $ndcg;
	    $$perrTotal += $err;
	    $$ptopics++;
	    # Pass the run id and topic as arguments so a '%' in the run id is
	    # never interpreted as a format directive.
	    printf("%s,%s,%.5f,%.5f\n",$runid,$topic,$ndcg,$err) if ($printTopic);
	    return ($ndcg,$err);
	}

	# Evaluate one run file against the judgments.
	#   $run              - path of the run file; each line holds
	#                       "topic Q0 docno rank score runid"
	#   $printTopics      - true to print the CSV header, per-topic lines and
	#                       the "amean" line
	#   $pseen            - hash ref: topics that have positive judgments
	#   $pideal           - hash ref: ideal DCG per topic
	#   $pjudgment        - hash ref: "topic:docno" => judgment
	#   $avgOverAllTopics - true to average over all judged topics instead of
	#                       only the topics evaluated in the run
	#   $isRiskSensitive  - true to report risk-weighted differences against
	#                       the baseline
	#   $riskAlpha        - risk sensitivity
	#   $baserunname      - run id of the baseline (used in the reported run id)
	#   $baseNDCGByTopic, $baseERRByTopic
	#                     - hash refs with the baseline's per-topic scores
	# Returns (\%ndcgByTopic, \%errByTopic, $runid). Dies if the run file
	# cannot be opened or a line is malformed.
	sub processRun
	{
	    my ($run,$printTopics,$pseen,$pideal,$pjudgment,$avgOverAllTopics,$isRiskSensitive,$riskAlpha,$baserunname,$baseNDCGByTopic,$baseERRByTopic) = @_;
	    my $ndcgByTopic = {};
	    my $errByTopic = {};
	    my $runid = "?????";
	    my @run = ();

	    # Read run file, check format, and sort.
	    # Three-argument open so the file name is never parsed as a mode.
	    open (my $runFh, '<', $run) or die "$0: cannot open \"$run\": $!\n";
	    while (<$runFh>) {
	        s/[\r\n]//g;
	        my ($topic, $q0, $docno, $rank, $score);
	        # $runid is deliberately not lexical here: the id from the last
	        # line read becomes the id of the whole run.
	        ($topic, $q0, $docno, $rank, $score, $runid) = split (' ');
	        $topic =~ s/^.*\-//;
	        die "$0: format error on line $. of \"$run\"\n"
	            unless
	                $topic =~ /^[0-9]+$/ && $q0 eq "Q0" && $rank =~ /^[0-9]+$/ && $runid;
	        push @run, "$topic $docno $score";
	    }
	    # Close as soon as reading is finished; a close placed after the
	    # return statement would never run.
	    close ($runFh);

	    @run = sort runOrder (@run);

	    # Track which judged topics the run covers. Iterate over the keys
	    # only: a bare %$pseen would also yield the values as bogus topics.
	    my %processed = ();
	    foreach my $topic (keys %$pseen)
	    {
	        $processed{$topic} = 0;
	    }

	    if ($isRiskSensitive)
	    {
	        $runid = sprintf("%s (rel to. %s, rs=1+a, a=%s)",$runid,$baserunname,$riskAlpha);
	    }

	    # Process runs: compute measures for each topic and average
	    my $ndcgTotal = 0;
	    my $errTotal = 0;
	    my $topics = 0;
	    print "runid,topic,ndcg\@$K,err\@$K\n" if ($printTopics);

	    # Score one finished topic. The baseline values are reset for every
	    # topic, so a topic absent from the baseline is compared against 0
	    # rather than against the previous topic's baseline. When $record is
	    # true the scores are stored and the topic is marked as processed.
	    my $finishTopic = sub {
	        my ($topic, $record, @topicGain) = @_;
	        my ($baseNDCG,$baseERR) = (0, 0);
	        if ($isRiskSensitive)
	        {
	            $baseNDCG = $$baseNDCGByTopic{$topic} if (exists($$baseNDCGByTopic{$topic}) and defined($$baseNDCGByTopic{$topic}));
	            $baseERR = $$baseERRByTopic{$topic} if (exists($$baseERRByTopic{$topic}) and defined($$baseERRByTopic{$topic}));
	        }
	        my ($ndcg,$err) = topicDone ($printTopics, $runid, $topic, \$ndcgTotal, \$errTotal, \$topics,
	            $pseen, $pideal, $isRiskSensitive, $riskAlpha, $baseNDCG, $baseERR, @topicGain);
	        if ($record)
	        {
	            $$ndcgByTopic{$topic} = $ndcg;
	            $$errByTopic{$topic} = $err;
	            $processed{$topic} = 1;
	        }
	    };

	    # @run is sorted by topic, so a change of topic number marks the end
	    # of the previous topic's ranking.
	    my $topicCurrent = -1;
	    my @gain = ();
	    foreach my $entry (@run) {
	        my ($topic, $docno, $score) = split (' ', $entry);
	        if ($topic != $topicCurrent) {
	            if ($topicCurrent >= 0) {
	                $finishTopic->($topicCurrent, 1, @gain);
	                @gain = ();
	            }
	            $topicCurrent = $topic;
	        }
	        # Unjudged documents count as gain 0.
	        my $j = $$pjudgment{"$topic:$docno"};
	        push @gain, ($j ? $j : 0);
	    }
	    # Flush the last topic.
	    $finishTopic->($topicCurrent, 1, @gain) if ($topicCurrent >= 0);

	    my $numTopics = $topics; # $topics has the number in the run (at this point)
	    if ($avgOverAllTopics)
	    {
	        $numTopics = scalar(keys %$pseen); # we want denominator to change whenever flag is on but only need to compute differences for risk
	        if ($isRiskSensitive)
	        { # need to process any topics that were missing from run
	            foreach my $missing (sort {$a <=> $b} keys %processed)
	            {
	                next if ($processed{$missing});
	                # Empty ranking: the run scores 0 on this topic, so the
	                # reported value is the risk-weighted loss vs. the baseline.
	                $finishTopic->($missing, 0);
	            }
	        }
	    }

	    my $ndcgAvg = $ndcgTotal;
	    my $errAvg = $errTotal;
	    if ($numTopics > 0)
	    {
	        $ndcgAvg /= $numTopics;
	        $errAvg /= $numTopics;
	    }
	    # Pass the run id as an argument so a '%' in it is never interpreted
	    # as a format directive.
	    printf("%s,amean,%.5f,%.5f\n",$runid,$ndcgAvg,$errAvg) if ($printTopics);

	    return ($ndcgByTopic,$errByTopic,$runid);
	}