Spaces:

Rom89823974978
/

RAG_Eval

Sleeping

App Files Files Community

RAG_Eval / tests /test_stats.py

Rom89823974978

Updated and resolved issues

3b8840f 4 months ago

raw

history blame contribute delete

2.55 kB

	import numpy as np
	import pytest

	from evaluation.stats import (
	corr_ci,
	wilcoxon_signed_rank,
	holm_bonferroni,
	delta_metric,
	conditional_failure_rate,
	chi2_error_propagation,
	)


	def test_corr_ci():
	    """Sanity-check ``corr_ci`` on an almost perfectly monotonic pair.

	    ``y`` is ``x`` plus jitter with a scale of 1e-6, far smaller than the
	    unit spacing of ``x``. The test only verifies that the returned
	    coefficient, confidence bounds and p-value fall inside their valid
	    ranges, and that the interval is ordered and non-negative.
	    """
	    # Draw the jitter from a locally seeded generator so the input data is
	    # identical on every run and the global NumPy random state is untouched.
	    # NOTE(review): corr_ci's own bootstrap resampling may still use a
	    # separate random source -- confirm whether it can be seeded as well.
	    rng = np.random.default_rng(0)
	    x = np.arange(10)
	    y = np.arange(10) + rng.normal(scale=1e-6, size=10)

	    rho, (lo, hi), p = corr_ci(x, y, method="spearman", n_boot=1000, ci=0.90)

	    assert -1 <= rho <= 1
	    assert 0 <= lo <= hi <= 1
	    assert 0 <= p <= 1


	def test_wilcoxon():
	    """Smoke-test ``wilcoxon_signed_rank`` on a tiny paired sample.

	    Only checks that the second returned value is a valid probability;
	    the first returned value is ignored.
	    """
	    first_sample = [1, 2, 3]
	    second_sample = [1, 3, 5]

	    _unused, p_value = wilcoxon_signed_rank(first_sample, second_sample)

	    # No exact value is pinned here: the p-value just has to lie in [0, 1].
	    assert 0 <= p_value <= 1


	def test_holm():
	    """Check ``holm_bonferroni`` against hand-computed adjusted p-values."""
	    p_values = {"a": 0.01, "b": 0.04, "c": 0.20}
	    adjusted = holm_bonferroni(p_values)

	    # With m = 3 and raw p-values sorted ascending as [0.01, 0.04, 0.20]:
	    #   a_adj = 3 * 0.01 = 0.03
	    #   b_adj = 2 * 0.04 = 0.08
	    #   c_adj = 1 * 0.20 = 0.20
	    expected = (("a", 0.03), ("b", 0.08), ("c", 0.2))
	    for label, target in expected:
	        assert adjusted[label] == pytest.approx(target, rel=1e-6)


	import pytest
	import math
	import numpy as np

	from evaluation.stats.robustness import delta_metric, conditional_failure_rate


	def test_delta_and_failure_rate():
	    """Exercise ``delta_metric`` and ``conditional_failure_rate``.

	    For each function the test checks the values returned for a small
	    hand-computed example and expects ``ValueError`` when the two input
	    sequences have different lengths. For ``conditional_failure_rate`` it
	    additionally expects NaN for the error-conditioned rate when the
	    retrieval-error flags are all zero.
	    """
	    tolerance = 1e-6

	    # ---- delta_metric -------------------------------------------------
	    baseline_scores = [0.9, 0.8, 0.7]
	    perturbed_scores = [0.85, 0.75, 0.65]
	    delta_value, d_value = delta_metric(baseline_scores, perturbed_scores)

	    assert isinstance(delta_value, float)
	    assert delta_value == pytest.approx(-0.05, rel=tolerance)

	    assert isinstance(d_value, float)
	    assert d_value == pytest.approx(-0.5, rel=tolerance)

	    # Inputs of unequal length are expected to be rejected.
	    with pytest.raises(ValueError):
	        delta_metric([1.0, 2.0], [1.0])

	    # ---- conditional_failure_rate: mixed flags --------------------------
	    error_flags = [0, 1, 0, 1]
	    hallucination_flags = [1, 0, 0, 1]
	    mixed_rates = conditional_failure_rate(error_flags, hallucination_flags)

	    expected_keys = {
	        "p_hallucination_given_error",
	        "p_hallucination_given_success",
	    }
	    assert set(mixed_rates.keys()) == expected_keys

	    given_error = mixed_rates["p_hallucination_given_error"]
	    assert given_error == pytest.approx(0.5, rel=tolerance)

	    given_success = mixed_rates["p_hallucination_given_success"]
	    assert given_success == pytest.approx(0.5, rel=tolerance)

	    # ---- conditional_failure_rate: no retrieval errors at all -----------
	    no_error_flags = [0, 0, 0]
	    mostly_hallucinated = [1, 1, 0]
	    no_error_rates = conditional_failure_rate(no_error_flags, mostly_hallucinated)

	    # The error-conditioned rate is expected to come back as NaN here.
	    assert math.isnan(no_error_rates["p_hallucination_given_error"])
	    assert no_error_rates["p_hallucination_given_success"] == pytest.approx(
	        2 / 3, rel=tolerance
	    )

	    # Inputs of unequal length are expected to be rejected.
	    with pytest.raises(ValueError):
	        conditional_failure_rate([0, 1], [1])



	def test_chi2_error_propagation():
	    """Check the shape of the ``chi2_error_propagation`` result.

	    The result must be a ``dict`` whose ``"chi2"`` and ``"p"`` entries are
	    floats; no specific numeric values are asserted.
	    """
	    first_counts = [10, 20, 30]
	    second_counts = [15, 25, 35]

	    result = chi2_error_propagation(first_counts, second_counts)

	    assert isinstance(result, dict)
	    # A missing key yields None from .get(), which fails the float check.
	    for field in ("chi2", "p"):
	        assert isinstance(result.get(field), float)