import json
import metrics
import argparse
import numpy as np
import multiprocessing
from tqdm import trange
import signal, functools
import re, os, sys, random, time
from fraction import Fraction
from data_processing.answer_extraction import *
from functools import lru_cache
from eval.eval_script import *

MAX_INT = sys.maxsize
INVALID_ANS = "[Invalid]"
INF = 1e9

__all__ = [
    "check_equal",
    "check_equal_without_timeout",
    "numberic_compare",
    "Evaluator",
]


@lru_cache(maxsize=1000000)
def check_equal_without_timeout(ans_1, ans_2):
    return math_equal(ans_1, ans_2)


def check_equal(ans_1, ans_2, cache_dict=None):
    try:
        if cache_dict is not None:
            key = str(ans_1) + "<##>" + str(ans_2)
            if key in cache_dict:
                return cache_dict[key]
            # Cache miss: fall back to the direct (cached-in-process) comparison.
            print("Miss")
        return check_equal_without_timeout(ans_1, ans_2)
    except TimeoutError:
        return False


def numberic_compare(ai, aj, ci, cj, cache_dict=None):
    return check_equal(ai, aj, cache_dict)


def prep_evaluator(predicts, completions, perplexities, answer, equal_func, check_equal):
    m = len(predicts)

    # Find the highest mean log-probability and count how many samples attain it.
    max_perplexity = -INF
    max_perplexity_count = 0.0
    for i in range(m):
        if perplexities[i] > max_perplexity:
            max_perplexity = perplexities[i]
            max_perplexity_count = 0.0
        if perplexities[i] >= max_perplexity:
            max_perplexity_count += 1.0

    # Accuracy: credit is split evenly among the samples tied at the maximum.
    correct, answers = 0, []
    for i in range(m):
        ans_i = predicts[i]
        answers.append([ans_i, np.exp(perplexities[i]), check_equal(ans_i, answer)])
        if perplexities[i] < max_perplexity:
            continue
        if check_equal(ans_i, answer):
            correct += 1.0 / max_perplexity_count
    return correct, answers


class Evaluator:
    def __init__(self):
        self.name = "Perplexity"

    def process(self, json_file, cache_file, equal_func, evaluator, K, seed=0):
        # with open(file_path, 'r', encoding='utf-8') as f:
        #     results = json.load(f)
        results = json_file
        n = len(results["predict"])
        m = len(results["predict"][0])

        # Sample K completions per question with a per-seed shuffle.
        indices = list(range(m))
        random.seed(seed)
        random.shuffle(indices)
        indices = indices[:K]

        if cache_file is not None:
            def cache_equal_func(ai, aj, ci, cj):
                return equal_func(ai, aj, ci, cj, cache_file)

            def cache_check_equal(ai, aj):
                return check_equal(ai, aj, cache_file)
        else:
            cache_equal_func = equal_func
            cache_check_equal = check_equal

        predicts, completions, perplexities, answers = [], [], [], []
        for i in range(n):
            predicts.append([results["predict"][i][j] for j in indices])
            completions.append([results["completion"][i][j] for j in indices])
            perplexities.append([results["mean_logprob"][i][j] for j in indices])
            answers.append(results["answer"][i])
        n = len(predicts)

        start_time = time.time()
        outputs = []
        for idx in trange(n):
            res = evaluator(
                predicts[idx],
                completions[idx],
                perplexities[idx],
                answers[idx],
                cache_equal_func,
                cache_check_equal,
            )
            outputs.append(res)
        print(f"Running Time with Single Process Mode with Seed #{seed}: {time.time() - start_time:.2f}s")

        maximum, max_bins = metrics.compute_maximum_metrics([x[1] for x in outputs])
        average, avg_bins = metrics.compute_average_metrics([x[1] for x in outputs])
        accs = np.mean([x[0] for x in outputs])
        return accs * 100.0, maximum, average, max_bins, avg_bins

    def worker(self, args):
        json_file, cache_file, K, seed = args
        acc, maximum, average, max_bins, avg_bins = self.process(
            json_file=json_file,
            cache_file=cache_file,
            equal_func=numberic_compare,
            evaluator=prep_evaluator,
            K=K,
            seed=seed,
        )
        return acc, maximum, average

    def solve(self, json_file, cache_file=None, repeats=10, K=128):
        with multiprocessing.Pool() as pool:
            results = pool.map(
                self.worker,
                [(json_file, cache_file, K, seed) for seed in range(repeats)],
            )
        accs, maxs, _ = zip(*results)
        accs, maxs = np.array(accs), np.array(maxs)
        return {
            "Accuracy": f"{accs.mean():.2f} ± {accs.std():.2f}",
            "ECE": f"{maxs[:, 0].mean() * 100.0:.2f} ± {maxs[:, 0].std() * 100.0:.2f}",
        }
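

# Usage sketch (an assumption, not part of the original module): `solve` expects a
# dict shaped like {"predict": [[...]], "completion": [[...]], "mean_logprob": [[...]],
# "answer": [...]}, typically loaded from a results JSON produced by an upstream
# sampling script. The file path below is hypothetical, and the optional cache is a
# plain dict keyed as f"{ans_1}<##>{ans_2}" mapping to a boolean.
if __name__ == "__main__":
    with open("results/perplexity_results.json", "r", encoding="utf-8") as f:
        results = json.load(f)

    evaluator = Evaluator()
    # repeats controls how many random K-subsets are drawn; the report gives
    # mean ± std over those repeats.
    report = evaluator.solve(results, cache_file=None, repeats=10, K=128)
    print(json.dumps(report, ensure_ascii=False, indent=2))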