from dataloading import load_run_data
from lmsim.metrics import Kappa_p


def compute_similarity(selected_model_a, selected_model_b, selected_dataset):
    """Compute the Kappa_p similarity between two models on one dataset.

    The run data of both models is loaded with ``load_run_data``; each call
    yields a ``(probs, ground_truth)`` pair.  Only samples whose ground-truth
    entries agree between the two runs are passed on to
    ``Kappa_p.compute_k``.

    Args:
        selected_model_a: Identifier of the first model, forwarded unchanged
            to ``load_run_data``.
        selected_model_b: Identifier of the second model, forwarded unchanged
            to ``load_run_data``.
        selected_dataset: Identifier of the dataset, forwarded unchanged to
            ``load_run_data``.

    Returns:
        Whatever ``Kappa_p().compute_k(output_a, output_b, gt)`` returns for
        the filtered samples.

    Raises:
        AssertionError: If the two runs contain a different number of
            responses.
    """
    probs_a, gt_a = load_run_data(selected_model_a, selected_dataset)
    probs_b, gt_b = load_run_data(selected_model_b, selected_dataset)

    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped under ``python -O``; the exception type and message are the
    # same as before, so existing callers are unaffected.
    if len(probs_a) != len(probs_b):
        raise AssertionError("Models must have the same number of responses")

    # Only keep responses where the ground truth is the same.  The labels are
    # compared per sample; comparing the two ground-truth containers as a
    # whole would keep either everything or nothing.
    # NOTE(review): assumes each ground-truth sequence has one entry per
    # response and that entries compare to a plain bool -- confirm against
    # load_run_data.
    output_a = []
    output_b = []
    gt = []
    for prob_a, prob_b, label_a, label_b in zip(probs_a, probs_b, gt_a, gt_b):
        if label_a == label_b:
            output_a.append(prob_a)
            output_b.append(prob_b)
            gt.append(label_a)

    # Score the agreement between the two models on the retained samples.
    kappa_p = Kappa_p()
    similarity = kappa_p.compute_k(output_a, output_b, gt)

    return similarity