Spaces:
Sleeping
Sleeping
| """ | |
| Calculates McNemar's test for significance between two models, | |
| using the stated binarization threshold. | |
| """ | |
| import sys | |
| import os | |
# Append this file's grandparent directory to the import search path so that
# the `modules` package imported on the next line can be resolved
# (presumably the project root -- confirm against the repository layout).
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from modules.reasoning_engine import ReasoningEngine | |
# Cut-off used to binarize scores in mcnemars_test (score >= THRESHOLD -> 1),
# read from a class attribute of ReasoningEngine.
| THRESHOLD = ReasoningEngine.MCNEMAR_BINARIZATION_THRESHOLD | |
def mcnemars_test(scores_model_A: list, scores_model_B: list, *, threshold: "float | None" = None) -> float:
    """
    Compute the p-value of McNemar's test for two paired lists of scores.

    Each score is binarized (1 if ``score >= threshold`` else 0), the paired
    outcomes are tallied into a 2x2 contingency table, and the
    continuity-corrected statistic ``(|b - c| - 1)^2 / (b + c)`` is compared
    against a chi-square distribution with one degree of freedom. A summary
    of the intermediate values is printed to stdout.

    Args:
        scores_model_A: Float faithfulness scores of model A.
        scores_model_B: Float faithfulness scores of model B, paired by index
            with ``scores_model_A``.
        threshold: Binarization cut-off. When omitted, the module-level
            ``THRESHOLD`` is used.

    Returns:
        The p-value as a float in [0, 1]. Returns 1.0 when there are no
        discordant pairs (including empty input).

    Raises:
        ValueError: If the two lists differ in length.
    """
    # Function-scope import, matching how this function imported its
    # numerical helper before; only the standard library is required now.
    import math

    if len(scores_model_A) != len(scores_model_B):
        raise ValueError("Must have same number of scores")

    if threshold is None:
        threshold = THRESHOLD

    # Binarize
    bin_A = [1 if s >= threshold else 0 for s in scores_model_A]
    bin_B = [1 if s >= threshold else 0 for s in scores_model_B]

    # Contingency table
    #             B correct | B wrong
    # A correct |     a     |    b
    # A wrong   |     c     |    d
    a, b, c, d = 0, 0, 0, 0
    for a_val, b_val in zip(bin_A, bin_B):
        if a_val == 1 and b_val == 1:
            a += 1
        elif a_val == 1 and b_val == 0:
            b += 1
        elif a_val == 0 and b_val == 1:
            c += 1
        else:
            d += 1

    # Only the discordant cells (b, c) enter the statistic; without any of
    # them the test is undefined (division by zero), so report "no difference".
    if b + c == 0:
        print("Models are identical given the threshold.")
        return 1.0  # No difference

    # Chi-square statistic with continuity correction:
    # (|b - c| - 1)^2 / (b + c)
    chi_square = ((abs(b - c) - 1) ** 2) / (b + c)

    print("McNemar's Test Results:")
    print(f"Binarization Threshold: {threshold}")
    print(f"Contingency Table: a={a}, b={b}, c={c}, d={d}")
    print(f"Chi-square: {chi_square:.3f}")

    # For one degree of freedom the chi-square survival function has the
    # closed form erfc(sqrt(x / 2)), so the p-value is always available and
    # the function never has to fall back to returning the raw statistic.
    p_value = math.erfc(math.sqrt(chi_square / 2.0))
    print(f"p-value: {p_value:.4f}")
    return p_value
if __name__ == "__main__":
    # Mock data: each tuple is one paired example, (mexar score, baseline score).
    paired_scores = [
        (0.8, 0.5),
        (0.9, 0.7),
        (0.4, 0.6),
        (0.7, 0.4),
        (0.65, 0.55),
        (0.8, 0.8),
    ]
    scores_mexar = [pair[0] for pair in paired_scores]
    scores_baseline = [pair[1] for pair in paired_scores]
    mcnemars_test(scores_mexar, scores_baseline)