Spaces:

devrajsinh2012
/

Mexar

Sleeping

Mexar / backend /evaluation /statistical_tests.py

devrajsinh2012

Deploy current project snapshot to Hugging Face Space

53bb779 about 1 month ago

2.12 kB

	"""
	Runs McNemar's test to check whether two models differ significantly on
	paired evaluation scores, binarizing each score against THRESHOLD.
	"""
	import sys
	import os
	# Add the parent of this file's directory to sys.path before the project
	# import below (presumably so `modules` resolves when this file is run
	# directly as a script -- confirm against the project layout).
	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

	from modules.reasoning_engine import ReasoningEngine

	# Cutoff used by mcnemars_test to binarize scores: a score >= THRESHOLD
	# counts as 1, anything lower as 0. The value itself is defined on
	# ReasoningEngine.
	THRESHOLD = ReasoningEngine.MCNEMAR_BINARIZATION_THRESHOLD

	def mcnemars_test(scores_model_A: list, scores_model_B: list, threshold=None):
	    """
	    Compute the p-value of McNemar's test for two paired score lists.

	    Each score is binarized (1 if score >= threshold, else 0) and the pairs
	    are tallied into a 2x2 contingency table:

	                    B correct | B wrong
	        A correct       a     |    b
	        A wrong         c     |    d

	    The continuity-corrected chi-square statistic over the discordant
	    pairs (b and c) is converted to a p-value with one degree of freedom.
	    A summary of the intermediate values is printed to stdout.

	    Args:
	        scores_model_A: Float faithfulness scores for model A.
	        scores_model_B: Float faithfulness scores for model B, paired
	            index-by-index with scores_model_A.
	        threshold: Binarization cutoff. When omitted (None), the
	            module-level THRESHOLD is used.

	    Returns:
	        The p-value as a float in [0, 1]. Returns 1.0 when there are no
	        discordant pairs.

	    Raises:
	        ValueError: If the two lists differ in length.
	    """
	    import math

	    if len(scores_model_A) != len(scores_model_B):
	        raise ValueError("Must have same number of scores")

	    cutoff = THRESHOLD if threshold is None else threshold

	    # Tally the paired, binarized outcomes into the contingency table.
	    a = b = c = d = 0
	    for score_a, score_b in zip(scores_model_A, scores_model_B):
	        a_correct = score_a >= cutoff
	        b_correct = score_b >= cutoff
	        if a_correct and b_correct:
	            a += 1
	        elif a_correct:
	            b += 1
	        elif b_correct:
	            c += 1
	        else:
	            d += 1

	    # Only discordant pairs carry information; without any, the statistic
	    # is undefined (division by zero) and there is no evidence of a
	    # difference.
	    if b + c == 0:
	        print("Models are identical given the threshold.")
	        return 1.0  # No difference

	    # Continuity correction, clamped at zero so that perfectly balanced
	    # discordant counts (b == c) yield a statistic of 0 rather than a
	    # spurious positive value from squaring -1.
	    corrected_diff = max(abs(b - c) - 1, 0)
	    chi_square = (corrected_diff ** 2) / (b + c)

	    print("McNemar's Test Results:")
	    print(f"Binarization Threshold: {cutoff}")
	    print(f"Contingency Table: a={a}, b={b}, c={c}, d={d}")
	    print(f"Chi-square: {chi_square:.3f}")

	    # Survival function of the chi-square distribution with 1 degree of
	    # freedom: P(X > x) = erfc(sqrt(x / 2)). Using the standard library
	    # means a p-value is always returned (never the raw statistic) and
	    # avoids the precision loss of computing 1 - cdf for tiny p-values.
	    p_value = math.erfc(math.sqrt(chi_square / 2))
	    print(f"p-value: {p_value:.4f}")
	    return p_value

	if __name__ == "__main__":
	    # Demo run on made-up paired scores: each tuple is
	    # (mexar score, baseline score) for the same evaluation item.
	    mock_pairs = [
	        (0.8, 0.5),
	        (0.9, 0.7),
	        (0.4, 0.6),
	        (0.7, 0.4),
	        (0.65, 0.55),
	        (0.8, 0.8),
	    ]
	    scores_mexar = [mexar for mexar, _ in mock_pairs]
	    scores_baseline = [baseline for _, baseline in mock_pairs]
	    mcnemars_test(scores_mexar, scores_baseline)