import pandas as pd import json import os import numpy as np import re import gradio as gr tasks = ["hellaswag", "arc_challenge", "hendrycks", "truthfulqa_mc"] validators = ["opentensor_foundation"] def clean_result(result, task): if ("hendrycks" in task): if ((len(result["result"]) <= 2) and (result["result"] != "") and (result["result"][0].isupper())) or ((result["result"] != "") and (re.match('[A-Z]\.', result["result"][:2]))): if result["result"][0] == "A": result["cleaned_result"] = "1" elif result["result"][0] == "B": result["cleaned_result"] = "2" elif result["result"][0] == "C": result["cleaned_result"] = "3" elif result["result"][0] == "D": result["cleaned_result"] = "4" else: result["cleaned_result"] = "N/A" else: result["cleaned_result"] = "N/A" elif (task == "truthfulqa_mc"): cleaned_result = [] for r in result['result']: if 'False' in r: cleaned_result.append(0) elif 'True' in r: cleaned_result.append(1) else: cleaned_result.append("N/A") result["cleaned_result"] = cleaned_result else: if (result["result"] != "") and (result["result"][0].isnumeric()): result["cleaned_result"] = result["result"][0] else: result["cleaned_result"] = "N/A" return result def mc2(doc): # Split on the first `0` as everything before it is true (`1`). split_idx = list(doc["mc2_targets"]["labels"]).index(0) lls = doc["cleaned_result"] # Compute the normalized probability mass for the correct answer. ll_true, ll_false = lls[:split_idx], lls[split_idx:] p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false)) p_true = p_true / (sum(p_true) + sum(p_false)) return sum(p_true) final_total_results = [] final_split_results = [] results_cumulative = [] for validator in validators: results_dir_file_list = os.listdir(f"""_results/few-shot/{validator}""") number_of_nas, number_of_results, inference_total = 0,0,0 for task in tasks: task_results_files = [result_file for result_file in results_dir_file_list if task in result_file] results = [] for task_results_file in task_results_files: results_file_dir = f"""_results/few-shot/{validator}/{task_results_file}""" f = open(results_file_dir) results += json.load(f) results = [clean_result(result, task) if "result" in result else result for result in results] results_cumulative += results # Total results number_of_nas += len([1 for result in results if ('cleaned_result' in result) and ('N/A' in result['cleaned_result'])]) inference_total += np.array([result['inference_time'] for result in results if 'inference_time' in result]).sum() number_of_results += len([1 for result in results if ('cleaned_result' in result)]) # Indiviudal results result_coverage = round((sum(['result' in result for result in results])/len(results))*100,2) na_coverage = round((len([1 for result in results if ('cleaned_result' in result) and ('N/A' in result['cleaned_result'])])/len(['result' in result for result in results]))*100,2) inference_avg = round(np.array([result['inference_time'] for result in results if 'inference_time' in result]).mean(), 2) if task == "truthfulqa_mc": metric = round(np.array([mc2(result) for result in results if ("cleaned_result" in result) and ("N/A" not in result["cleaned_result"])]).mean()*100,2) else: metric = round((len([result for result in results if ("cleaned_result" in result) and (result["cleaned_result"] != "N/A") and (int(result["cleaned_result"]) == (int(result["gold"])+1))])/len([result for result in results if ("cleaned_result" in result) and (result["cleaned_result"] != "N/A") ]))*100,2) final_split_results.append({ "task" : task, "coverage_%" :result_coverage, "na_%" : na_coverage, "inference_avg" : inference_avg, "metric" : metric }) print(final_split_results) final_total_results.append({ "Validator": validator.replace("_", " ").capitalize(), "N/A %" : round((number_of_nas/number_of_results)*100,2), "Avg Inference (s)" : round((inference_total/number_of_results),2), "Average ⬆️": 0, "ARC (25-shot) ⬆️": final_split_results[tasks.index("arc_challenge")]["metric"], "HellaSwag (10-shot) ⬆️": final_split_results[tasks.index("hellaswag")]["metric"], "MMLU (5-shot) ⬆️": final_split_results[tasks.index("hendrycks")]["metric"], "TruthfulQA (0-shot) ⬆️": final_split_results[tasks.index("truthfulqa_mc")]["metric"] }) final_total_results[-1]["Average ⬆️"] = np.array([final_total_results[0]["ARC (25-shot) ⬆️"], final_total_results[0]["HellaSwag (10-shot) ⬆️"],final_total_results[0]["TruthfulQA (0-shot) ⬆️"], final_total_results[0]["MMLU (5-shot) ⬆️"]]).mean() df = pd.DataFrame(results_cumulative) df = df[df["cleaned_result"] == "N/A"].groupby("result", as_index=False).count().sort_values(by = ["id"], ascending = False).head(10)[["result","id"]].rename(columns={"result": "Result", "id": "ID"}) demo = gr.Blocks() with demo: with gr.Row(): title = gr.Markdown(value=f"""#

Bittensor LMEH Leaderboard

""") with gr.Row(): table_1 = gr.Dataframe(pd.DataFrame(final_total_results)) with gr.Row(): title = gr.Markdown(value=f"""#

Analysis Of Top 10 N/A Responses

""") with gr.Row(): table_2 = gr.Dataframe(df) # with gr.Row(visible = False): # table_2 = gr.Dataframe(pd.DataFrame(final_split_results)) demo.queue(concurrency_count = 5) demo.launch(enable_queue=True, debug=True, server_name="0.0.0.0", server_port=7860)