import pandas as pd
import json
import os
import numpy as np
import re
import gradio as gr

tasks = ["hellaswag", "arc_challenge", "hendrycks", "truthfulqa_mc"]
validators = ["opentensor_foundation"]
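
# Assumed layout of the result files (inferred from the fields accessed below;
# not documented in the original source): each JSON file under
# _results/few-shot/<validator>/ holds a list of records with keys such as
# "result", "gold", "id", "inference_time" and, for truthfulqa_mc,
# "mc2_targets". Treat this as a sketch of the expected schema, not a spec.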

def clean_result(result, task):
    """Normalise a raw miner response into a comparable `cleaned_result` field."""
    if "hendrycks" in task:
        # MMLU answers arrive as letters ("A".."D"), optionally followed by a period.
        answer = result["result"]
        if (answer != "" and len(answer) <= 2 and answer[0].isupper()) or \
           (answer != "" and re.match(r"[A-Z]\.", answer[:2])):
            if answer[0] == "A":
                result["cleaned_result"] = "1"
            elif answer[0] == "B":
                result["cleaned_result"] = "2"
            elif answer[0] == "C":
                result["cleaned_result"] = "3"
            elif answer[0] == "D":
                result["cleaned_result"] = "4"
            else:
                result["cleaned_result"] = "N/A"
        else:
            result["cleaned_result"] = "N/A"
    elif task == "truthfulqa_mc":
        # TruthfulQA responses are a list of True/False judgements.
        cleaned_result = []
        for r in result["result"]:
            if "False" in r:
                cleaned_result.append(0)
            elif "True" in r:
                cleaned_result.append(1)
            else:
                cleaned_result.append("N/A")
        result["cleaned_result"] = cleaned_result
    else:
        # HellaSwag / ARC answers are expected to start with a digit.
        if result["result"] != "" and result["result"][0].isnumeric():
            result["cleaned_result"] = result["result"][0]
        else:
            result["cleaned_result"] = "N/A"
    return result
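
# Illustration (hypothetical raw values, for reference only): a hendrycks
# result of "B." or "B" is cleaned to "2"; a hellaswag/arc_challenge result
# starting with "3" is cleaned to "3"; a truthfulqa_mc result of
# ["True", "False"] becomes [1, 0]; anything else is marked "N/A".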

def mc2(doc):
    # Split on the first `0` label, as everything before it is true (`1`).
    split_idx = list(doc["mc2_targets"]["labels"]).index(0)
    lls = doc["cleaned_result"]
    # Compute the normalized probability mass for the correct answers.
    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
    p_true = p_true / (sum(p_true) + sum(p_false))
    return sum(p_true)
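
# Worked example with made-up numbers: labels [1, 1, 0, 0] and a cleaned_result
# of [1, 0, 1, 0] give split_idx = 2, so p_true = exp([1, 0]) and
# p_false = exp([1, 0]); after normalising, sum(p_true) = (e + 1) / (2e + 2) = 0.5.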

final_total_results = []
final_split_results = []
results_cumulative = []
for validator in validators:
    results_dir_file_list = os.listdir(f"_results/few-shot/{validator}")
    number_of_nas, number_of_results, inference_total = 0, 0, 0
    validator_split_results = []
    for task in tasks:
        task_results_files = [result_file for result_file in results_dir_file_list if task in result_file]
        results = []
        for task_results_file in task_results_files:
            results_file_dir = f"_results/few-shot/{validator}/{task_results_file}"
            with open(results_file_dir) as f:
                results += json.load(f)
        results = [clean_result(result, task) if "result" in result else result for result in results]
        results_cumulative += results
        # Totals across all tasks for this validator
        number_of_nas += len([1 for result in results if ("cleaned_result" in result) and ("N/A" in result["cleaned_result"])])
        inference_total += np.array([result["inference_time"] for result in results if "inference_time" in result]).sum()
        number_of_results += len([1 for result in results if "cleaned_result" in result])
        # Individual (per-task) results
        result_coverage = round((sum(["result" in result for result in results]) / len(results)) * 100, 2)
        na_coverage = round((len([1 for result in results if ("cleaned_result" in result) and ("N/A" in result["cleaned_result"])]) / len(results)) * 100, 2)
        inference_avg = round(np.array([result["inference_time"] for result in results if "inference_time" in result]).mean(), 2)
        if task == "truthfulqa_mc":
            metric = round(np.array([mc2(result) for result in results if ("cleaned_result" in result) and ("N/A" not in result["cleaned_result"])]).mean() * 100, 2)
        else:
            metric = round((len([result for result in results if ("cleaned_result" in result) and (result["cleaned_result"] != "N/A") and (int(result["cleaned_result"]) == int(result["gold"]) + 1)]) / len([result for result in results if ("cleaned_result" in result) and (result["cleaned_result"] != "N/A")])) * 100, 2)
        validator_split_results.append({
            "task": task,
            "coverage_%": result_coverage,
            "na_%": na_coverage,
            "inference_avg": inference_avg,
            "metric": metric
        })
    final_split_results += validator_split_results
    print(final_split_results)
    final_total_results.append({
        "Validator": validator.replace("_", " ").capitalize(),
        "N/A %": round((number_of_nas / number_of_results) * 100, 2),
        "Avg Inference (s)": round(inference_total / number_of_results, 2),
        "Average ⬆️": 0,
        "ARC (25-shot) ⬆️": validator_split_results[tasks.index("arc_challenge")]["metric"],
        "HellaSwag (10-shot) ⬆️": validator_split_results[tasks.index("hellaswag")]["metric"],
        "MMLU (5-shot) ⬆️": validator_split_results[tasks.index("hendrycks")]["metric"],
        "TruthfulQA (0-shot) ⬆️": validator_split_results[tasks.index("truthfulqa_mc")]["metric"]
    })
    # Average of the four task metrics for the validator appended above.
    final_total_results[-1]["Average ⬆️"] = round(np.array([
        final_total_results[-1]["ARC (25-shot) ⬆️"],
        final_total_results[-1]["HellaSwag (10-shot) ⬆️"],
        final_total_results[-1]["TruthfulQA (0-shot) ⬆️"],
        final_total_results[-1]["MMLU (5-shot) ⬆️"]
    ]).mean(), 2)

# Top 10 raw responses that could not be parsed (i.e. were cleaned to "N/A").
df = pd.DataFrame(results_cumulative)
df = (
    df[df["cleaned_result"] == "N/A"]
    .groupby("result", as_index=False)
    .count()
    .sort_values(by=["id"], ascending=False)
    .head(10)[["result", "id"]]
    .rename(columns={"result": "Result", "id": "Count"})
)

demo = gr.Blocks()
with demo:
    with gr.Row():
        title = gr.Markdown(value="""# <p style="text-align: center;"> Bittensor LMEH Leaderboard</p>""")
    with gr.Row():
        table_1 = gr.Dataframe(pd.DataFrame(final_total_results))
    with gr.Row():
        title = gr.Markdown(value="""# <p style="text-align: center;"> Analysis Of Top 10 N/A Responses</p>""")
    with gr.Row():
        table_2 = gr.Dataframe(df)
    # with gr.Row(visible=False):
    #     table_2 = gr.Dataframe(pd.DataFrame(final_split_results))

demo.queue(concurrency_count=5)
demo.launch(enable_queue=True, debug=True, server_name="0.0.0.0", server_port=7860)