Spaces:
Sleeping
Sleeping
import numpy as np | |
import pytest | |
from evaluation.stats import ( | |
corr_ci, | |
wilcoxon_signed_rank, | |
holm_bonferroni, | |
delta_metric, | |
conditional_failure_rate, | |
chi2_error_propagation, | |
) | |
def test_corr_ci():
    """Smoke-test ``corr_ci`` on near-perfectly monotone data.

    ``y`` is ``x`` plus noise of scale 1e-6, which is far smaller than the
    unit spacing of ``x``, so both sequences are increasing in the same
    order. The test only checks that the returned correlation, confidence
    bounds and p-value fall inside their valid ranges.
    """
    # Use a seeded, local generator so the input data is identical on every
    # run instead of depending on the global NumPy random state.
    # NOTE(review): any resampling done inside corr_ci is not controlled
    # here -- confirm whether it accepts or needs its own seed.
    rng = np.random.default_rng(0)
    x = np.arange(10)
    y = np.arange(10) + rng.normal(scale=1e-6, size=10)
    rho, (lo, hi), p = corr_ci(x, y, method="spearman", n_boot=1000, ci=0.90)
    assert -1 <= rho <= 1
    # The data is strongly positively associated, so the whole interval is
    # expected to sit in [0, 1] with the bounds correctly ordered.
    assert 0 <= lo <= hi <= 1
    assert 0 <= p <= 1
def test_wilcoxon():
    """Smoke-test ``wilcoxon_signed_rank`` on two short paired samples."""
    first_sample = [1, 2, 3]
    second_sample = [1, 3, 5]
    _statistic, p_value = wilcoxon_signed_rank(first_sample, second_sample)
    # Only the range of the p-value is verified, not its exact value.
    assert 0 <= p_value <= 1
def test_holm():
    """Check the adjusted p-values ``holm_bonferroni`` returns for three inputs."""
    raw_p_values = {"a": 0.01, "b": 0.04, "c": 0.20}
    adjusted = holm_bonferroni(raw_p_values)
    # With m=3 and raw values sorted ascending as [0.01, 0.04, 0.20], the
    # multipliers are 3, 2 and 1:
    #   a: 3 * 0.01 = 0.03, b: 2 * 0.04 = 0.08, c: 1 * 0.20 = 0.20
    expected = {"a": 0.03, "b": 0.08, "c": 0.2}
    for label, want in expected.items():
        assert adjusted[label] == pytest.approx(want, rel=1e-6)
import pytest | |
import math | |
import numpy as np | |
from evaluation.stats.robustness import delta_metric, conditional_failure_rate | |
def test_delta_and_failure_rate():
    """Exercise ``delta_metric`` and ``conditional_failure_rate``.

    Covers expected values on small inputs, the NaN result when no retrieval
    error is present, and the ``ValueError`` raised for inputs of unequal
    length.
    """
    # --- delta_metric: every perturbed score is 0.05 below its original ---
    original_scores = [0.9, 0.8, 0.7]
    perturbed_scores = [0.85, 0.75, 0.65]
    mean_delta, effect_size = delta_metric(original_scores, perturbed_scores)
    assert isinstance(mean_delta, float)
    assert mean_delta == pytest.approx(-0.05, rel=1e-6)
    assert isinstance(effect_size, float)
    assert effect_size == pytest.approx(-0.5, rel=1e-6)

    # Inputs of different lengths must be rejected.
    with pytest.raises(ValueError):
        delta_metric([1.0, 2.0], [1.0])

    # --- conditional_failure_rate: mixed retrieval outcomes ---
    error_flags = [0, 1, 0, 1]
    hallucination_flags = [1, 0, 0, 1]
    mixed_rates = conditional_failure_rate(error_flags, hallucination_flags)
    expected_keys = {
        "p_hallucination_given_error",
        "p_hallucination_given_success",
    }
    assert set(mixed_rates.keys()) == expected_keys
    # One of the two error cases and one of the two success cases hallucinate.
    assert mixed_rates["p_hallucination_given_error"] == pytest.approx(0.5, rel=1e-6)
    assert mixed_rates["p_hallucination_given_success"] == pytest.approx(0.5, rel=1e-6)

    # --- conditional_failure_rate: no retrieval errors at all ---
    no_error_flags = [0, 0, 0]
    mostly_hallucinated = [1, 1, 0]
    success_only_rates = conditional_failure_rate(no_error_flags, mostly_hallucinated)
    # With no error cases the error-conditioned rate is expected to be NaN.
    assert math.isnan(success_only_rates["p_hallucination_given_error"])
    assert success_only_rates["p_hallucination_given_success"] == pytest.approx(
        2 / 3, rel=1e-6
    )

    # Inputs of different lengths must be rejected.
    with pytest.raises(ValueError):
        conditional_failure_rate([0, 1], [1])
def test_chi2_error_propagation():
    """Check that ``chi2_error_propagation`` returns a dict with float results."""
    first_counts = [10, 20, 30]
    second_counts = [15, 25, 35]
    result = chi2_error_propagation(first_counts, second_counts)
    assert isinstance(result, dict)
    # Both the statistic and the p-value must be present as floats.
    for key in ("chi2", "p"):
        assert isinstance(result.get(key), float)