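"""Gradio app that renders a Bittensor LMEH leaderboard from per-validator
result dumps stored under `_results/few-shot/<validator>/`. The task names
below are lm-evaluation-harness task identifiers."""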
import pandas as pd
import json
import os
import numpy as np
import re
import gradio as gr
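
# lm-eval-harness task names to aggregate, and the validators whose result
# dumps are read from `_results/few-shot/<validator>/`.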
tasks = ["hellaswag", "arc_challenge", "hendrycks", "truthfulqa_mc"]
validators = ["opentensor_foundation"]


def clean_result(result, task):
    """Normalise a raw model response into a comparable answer string."""
    if "hendrycks" in task:
        # MMLU: map a leading answer letter ("A".."D", optionally "A.") to "1".."4".
        if ((len(result["result"]) <= 2) and (result["result"] != "") and result["result"][0].isupper()) or ((result["result"] != "") and re.match(r"[A-Z]\.", result["result"][:2])):
            if result["result"][0] == "A":
                result["cleaned_result"] = "1"
            elif result["result"][0] == "B":
                result["cleaned_result"] = "2"
            elif result["result"][0] == "C":
                result["cleaned_result"] = "3"
            elif result["result"][0] == "D":
                result["cleaned_result"] = "4"
            else:
                result["cleaned_result"] = "N/A"
        else:
            result["cleaned_result"] = "N/A"
    elif task == "truthfulqa_mc":
        # TruthfulQA: map each per-choice response to 1 (true), 0 (false) or "N/A".
        cleaned_result = []
        for r in result["result"]:
            if "False" in r:
                cleaned_result.append(0)
            elif "True" in r:
                cleaned_result.append(1)
            else:
                cleaned_result.append("N/A")
        result["cleaned_result"] = cleaned_result
    else:
        # HellaSwag / ARC: keep a leading digit as the chosen answer.
        if (result["result"] != "") and result["result"][0].isnumeric():
            result["cleaned_result"] = result["result"][0]
        else:
            result["cleaned_result"] = "N/A"
    return result
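
# A hypothetical illustration of the mapping (records shown are not real data):
#   clean_result({"result": "B. Paris"}, "hendrycks")["cleaned_result"]            -> "2"
#   clean_result({"result": ["True", "False"]}, "truthfulqa_mc")["cleaned_result"] -> [1, 0]
#   clean_result({"result": "3"}, "arc_challenge")["cleaned_result"]               -> "3"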


def mc2(doc):
    """TruthfulQA MC2: normalized probability mass assigned to the true answers."""
    # Split on the first `0`, as everything before it is true (`1`).
    split_idx = list(doc["mc2_targets"]["labels"]).index(0)
    lls = doc["cleaned_result"]
    # Compute the normalized probability mass for the correct answers.
    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
    p_true = p_true / (sum(p_true) + sum(p_false))
    return sum(p_true)
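
# Worked example (hypothetical numbers, for illustration only): with labels
# [1, 1, 0] and scores [-1.0, -2.0, -1.5], the first two answers are true, so
# mc2 returns (e^-1 + e^-2) / (e^-1 + e^-2 + e^-1.5) ≈ 0.69.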


final_total_results = []
final_split_results = []
results_cumulative = []
for validator in validators:
    results_dir_file_list = os.listdir(f"_results/few-shot/{validator}")
    number_of_nas, number_of_results, inference_total = 0, 0, 0
    for task in tasks:
        task_results_files = [result_file for result_file in results_dir_file_list if task in result_file]
        results = []
        for task_results_file in task_results_files:
            results_file_dir = f"_results/few-shot/{validator}/{task_results_file}"
            with open(results_file_dir) as f:
                results += json.load(f)
        results = [clean_result(result, task) if "result" in result else result for result in results]
        results_cumulative += results

        # Totals across all tasks for this validator.
        number_of_nas += len([1 for result in results if ("cleaned_result" in result) and ("N/A" in result["cleaned_result"])])
        inference_total += np.array([result["inference_time"] for result in results if "inference_time" in result]).sum()
        number_of_results += len([1 for result in results if "cleaned_result" in result])

        # Individual (per-task) results.
        result_coverage = round((sum("result" in result for result in results) / len(results)) * 100, 2)
        na_coverage = round((len([1 for result in results if ("cleaned_result" in result) and ("N/A" in result["cleaned_result"])]) / len(results)) * 100, 2)
        inference_avg = round(np.array([result["inference_time"] for result in results if "inference_time" in result]).mean(), 2)
        if task == "truthfulqa_mc":
            metric = round(np.array([mc2(result) for result in results if ("cleaned_result" in result) and ("N/A" not in result["cleaned_result"])]).mean() * 100, 2)
        else:
            # Accuracy over answered questions; `gold` is 0-indexed, answers are 1-indexed.
            answered = [result for result in results if ("cleaned_result" in result) and (result["cleaned_result"] != "N/A")]
            metric = round((len([result for result in answered if int(result["cleaned_result"]) == (int(result["gold"]) + 1)]) / len(answered)) * 100, 2)
        final_split_results.append({
            "task": task,
            "coverage_%": result_coverage,
            "na_%": na_coverage,
            "inference_avg": inference_avg,
            "metric": metric,
        })
        print(final_split_results)

    # `final_split_results` accumulates across validators, so this validator's
    # per-task metrics are the last `len(tasks)` entries.
    metric_by_task = {r["task"]: r["metric"] for r in final_split_results[-len(tasks):]}
    final_total_results.append({
        "Validator": validator.replace("_", " ").capitalize(),
        "N/A %": round((number_of_nas / number_of_results) * 100, 2),
        "Avg Inference (s)": round(inference_total / number_of_results, 2),
        "Average ⬆️": 0,
        "ARC (25-shot) ⬆️": metric_by_task["arc_challenge"],
        "HellaSwag (10-shot) ⬆️": metric_by_task["hellaswag"],
        "MMLU (5-shot) ⬆️": metric_by_task["hendrycks"],
        "TruthfulQA (0-shot) ⬆️": metric_by_task["truthfulqa_mc"],
    })
    final_total_results[-1]["Average ⬆️"] = np.array([metric_by_task["arc_challenge"], metric_by_task["hellaswag"], metric_by_task["truthfulqa_mc"], metric_by_task["hendrycks"]]).mean()

# Top 10 raw responses that could not be parsed into an answer ("N/A").
df = pd.DataFrame(results_cumulative)
df = (
    df[df["cleaned_result"] == "N/A"]
    .groupby("result", as_index=False)
    .count()  # after count(), the `id` column holds the occurrence count
    .sort_values(by=["id"], ascending=False)
    .head(10)[["result", "id"]]
    .rename(columns={"result": "Result", "id": "Count"})
)

demo = gr.Blocks()
with demo:
    with gr.Row():
        title = gr.Markdown(value="""# <p style="text-align: center;"> Bittensor LMEH Leaderboard</p>""")
    with gr.Row():
        table_1 = gr.Dataframe(pd.DataFrame(final_total_results))
    with gr.Row():
        title = gr.Markdown(value="""# <p style="text-align: center;"> Analysis Of Top 10 N/A Responses</p>""")
    with gr.Row():
        table_2 = gr.Dataframe(df)
    # with gr.Row(visible=False):
    #     table_2 = gr.Dataframe(pd.DataFrame(final_split_results))

# `concurrency_count` follows the Gradio 3.x queue API; `enable_queue` is
# redundant once `queue()` has been called, so it is dropped from `launch()`.
demo.queue(concurrency_count=5)
demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)