# RAG_Eval/tests/test_stats.py
# Rom89823974978's picture
# Updated and resolved issues
# 3b8840f
import numpy as np
import pytest
from evaluation.stats import (
corr_ci,
wilcoxon_signed_rank,
holm_bonferroni,
delta_metric,
conditional_failure_rate,
chi2_error_propagation,
)
def test_corr_ci():
    """Smoke-test ``corr_ci`` on an almost perfectly monotone pair of series.

    ``y`` is ``x`` plus tiny Gaussian noise, so the two series are strongly
    positively related. The test only checks that the returned coefficient,
    confidence bounds and p-value fall inside their valid ranges.
    """
    # Draw the noise from a locally seeded generator so the input data is
    # identical on every run and the test itself never depends on the global
    # NumPy random state.
    rng = np.random.default_rng(0)
    x = np.arange(10)
    y = x + rng.normal(scale=1e-6, size=x.size)

    rho, (lo, hi), p = corr_ci(x, y, method="spearman", n_boot=1000, ci=0.90)

    assert -1 <= rho <= 1
    # The bounds must be ordered; they are expected to be non-negative here
    # because the two inputs are positively related.
    assert 0 <= lo <= hi <= 1
    assert 0 <= p <= 1
def test_wilcoxon():
    """Smoke-test ``wilcoxon_signed_rank`` on two short paired samples."""
    first_sample = [1, 2, 3]
    second_sample = [1, 3, 5]

    result = wilcoxon_signed_rank(first_sample, second_sample)
    _statistic, p_value = result

    # Only verify that the second returned value is a valid probability.
    assert p_value >= 0 and p_value <= 1
def test_holm():
    """Check the Holm-Bonferroni adjustment of three raw p-values.

    With m = 3 and the raw values sorted ascending (0.01, 0.04, 0.20), the
    expected adjusted values are 3 * 0.01 = 0.03, 2 * 0.04 = 0.08 and
    1 * 0.20 = 0.20.
    """
    raw_p_values = {"a": 0.01, "b": 0.04, "c": 0.20}
    expected = {"a": 0.03, "b": 0.08, "c": 0.2}

    adjusted = holm_bonferroni(raw_p_values)

    for label, target in expected.items():
        assert adjusted[label] == pytest.approx(target, rel=1e-6)
import pytest
import math
import numpy as np
from evaluation.stats.robustness import delta_metric, conditional_failure_rate
def test_delta_and_failure_rate():
    """Exercise ``delta_metric`` and ``conditional_failure_rate``.

    Covers the expected values on small inputs, the NaN result when the
    retrieval-error list contains no errors, and the ``ValueError`` raised
    when the two inputs have different lengths.
    """
    tolerance = 1e-6

    # --- delta_metric: expected values on a uniform shift of -0.05 ---
    original_scores = [0.9, 0.8, 0.7]
    perturbed_scores = [0.85, 0.75, 0.65]
    delta, cohen_d = delta_metric(original_scores, perturbed_scores)

    assert isinstance(delta, float)
    assert delta == pytest.approx(-0.05, rel=tolerance)
    assert isinstance(cohen_d, float)
    assert cohen_d == pytest.approx(-0.5, rel=tolerance)

    # Inputs of different lengths must be rejected.
    with pytest.raises(ValueError):
        delta_metric([1.0, 2.0], [1.0])

    # --- conditional_failure_rate: both error and success cases present ---
    error_key = "p_hallucination_given_error"
    success_key = "p_hallucination_given_success"

    retrieval_errors = [0, 1, 0, 1]
    hallucinations = [1, 0, 0, 1]
    rates = conditional_failure_rate(retrieval_errors, hallucinations)

    expected_keys = {error_key, success_key}
    assert set(rates.keys()) == expected_keys
    assert rates[error_key] == pytest.approx(0.5, rel=tolerance)
    assert rates[success_key] == pytest.approx(0.5, rel=tolerance)

    # --- conditional_failure_rate: no retrieval errors at all ---
    no_errors = [0, 0, 0]
    hallucinations_without_errors = [1, 1, 0]
    rates_without_errors = conditional_failure_rate(
        no_errors, hallucinations_without_errors
    )

    # With no error cases the conditional rate given an error is NaN.
    assert math.isnan(rates_without_errors[error_key])
    assert rates_without_errors[success_key] == pytest.approx(
        2 / 3, rel=tolerance
    )

    # Inputs of different lengths must be rejected here as well.
    with pytest.raises(ValueError):
        conditional_failure_rate([0, 1], [1])
def test_chi2_error_propagation():
    """Check the result shape of ``chi2_error_propagation``.

    The result must be a dict whose ``"chi2"`` and ``"p"`` entries are both
    floats; the numeric values themselves are not checked.
    """
    first_values = [10, 20, 30]
    second_values = [15, 25, 35]

    result = chi2_error_propagation(first_values, second_values)

    assert isinstance(result, dict)
    # ``.get`` is used so that a missing key fails the isinstance check
    # rather than raising KeyError.
    for key in ("chi2", "p"):
        assert isinstance(result.get(key), float)