import json
import re
from pathlib import Path

import gradio as gr
import pandas as pd
					
						
						|  | TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>""" | 
					
						
						|  |  | 
					
						
						|  | DESCRIPTION = f""" | 
					
						
						|  | Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_leaderboard_df(): | 
					
						
						|  | filepaths = list(Path("eval_results").rglob("*.json")) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | models = set() | 
					
						
						|  | for filepath in filepaths: | 
					
						
						|  | path_parts = Path(filepath).parts | 
					
						
						|  | model_revision = "_".join(path_parts[1:4]) | 
					
						
						|  | models.add(model_revision) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df = pd.DataFrame(index=list(models)) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | for filepath in filepaths: | 
					
						
						|  | path_parts = Path(filepath).parts | 
					
						
						|  | model_revision = "_".join(path_parts[1:4]) | 
					
						
						|  | task = path_parts[4].capitalize() | 
					
						
						|  |  | 
					
						
						|  | timestamp = filepath.stem.split("_")[-1][:-3] | 
					
						
						|  | df.loc[model_revision, "Timestamp"] = timestamp | 
					
						
						|  |  | 
					
						
						|  | with open(filepath, "r") as file: | 
					
						
						|  | data = json.load(file) | 
					
						
						|  | first_result_key = next(iter(data["results"])) | 
					
						
						|  |  | 
					
						
						|  | if task.lower() == "truthfulqa": | 
					
						
						|  | value = data["results"][first_result_key]["truthfulqa_mc2"] | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() == "ifeval": | 
					
						
						|  | value = data["results"][first_result_key]["prompt_level_loose_acc"] | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() == "mmlu": | 
					
						
						|  | value = data["results"]["lighteval|mmlu:_average|5"]["acc"] | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() in ["hellaswag", "arc"]: | 
					
						
						|  | value = data["results"][first_result_key]["acc_norm"] | 
					
						
						|  | else: | 
					
						
						|  | first_metric_key = next( | 
					
						
						|  | iter(data["results"][first_result_key]) | 
					
						
						|  | ) | 
					
						
						|  | value = data["results"][first_result_key][first_metric_key] | 
					
						
						|  | df.loc[model_revision, task] = value | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | ifeval_col = df.pop("Ifeval") | 
					
						
						|  | df.insert(1, "Ifeval", ifeval_col) | 
					
						
						|  | df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True)) | 
					
						
						|  |  | 
					
						
						|  | df[df.select_dtypes(include=["number"]).columns] *= 100.0 | 
					
						
						|  | df = df.sort_values(by=["Average"], ascending=False) | 
					
						
						|  | df = df.reset_index().rename(columns={"index": "Model"}).round(2) | 
					
						
						|  | return df | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def refresh(): | 
					
						
						|  | return get_leaderboard_df() | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def update_table(search_query): | 
					
						
						|  | df = get_leaderboard_df() | 
					
						
						|  | if search_query: | 
					
						
						|  | search_terms = search_query.split(";") | 
					
						
						|  | search_terms = [term.strip() for term in search_terms] | 
					
						
						|  | pattern = "|".join(search_terms) | 
					
						
						|  | df = df[df["Model"].str.contains(pattern, regex=True)] | 
					
						
						|  | return df | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | leaderboard_df = get_leaderboard_df() | 
					
						
						|  |  | 
					
						
						|  | demo = gr.Blocks() | 
					
						
						|  |  | 
					
						
						|  | with demo: | 
					
						
						|  | gr.HTML(TITLE) | 
					
						
						|  | with gr.Column(): | 
					
						
						|  | gr.Markdown(DESCRIPTION, elem_classes="markdown-text") | 
					
						
						|  | with gr.Row(): | 
					
						
						|  | search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False) | 
					
						
						|  | with gr.Group(): | 
					
						
						|  | leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True, height=1000) | 
					
						
						|  | with gr.Row(): | 
					
						
						|  | refresh_button = gr.Button("Refresh") | 
					
						
						|  |  | 
					
						
						|  | search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table]) | 
					
						
						|  | refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table]) | 
					
						
						|  |  | 
					
						
						|  | demo.launch() | 
					
						
						|  |  |