Add my custom leaderboard files
- .ipynb_checkpoints/app-checkpoint.py +224 -0
- src/envs.py +13 -5
.ipynb_checkpoints/app-checkpoint.py
ADDED
@@ -0,0 +1,224 @@
+import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
+from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
+from src.display.css_html_js import custom_css
+from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+import random
+
+# Define task metadata (icons, names, descriptions)
+TASK_METADATA_MULTIPLECHOICE = {
+    # "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
+    # "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
+    # "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
+    # "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""},
+    # "WIC": {"icon": "🔤", "name": "Word in Context", "tooltip": ""},
+    # "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""}
+}
+
+# Define task metadata (icons, names, descriptions)
+TASK_METADATA_GENERATIVE = {
+    # "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
+    # "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
+    "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
+    "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
+}
+
+def restart_space():
+    """Restart the Hugging Face space."""
+    API.restart_space(repo_id=REPO_ID)
+
+
+def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
+    """
+    Initialize and return the leaderboard when it is first loaded or when 'benchmark' is selected.
+    The table is sorted based on the "Avg. Combined Performance" field.
+    """
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    field_list = fields(AutoEvalColumn)
+
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in field_list],
+        #select_columns=SelectColumns(
+        #    default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
+        #    cant_deselect=[c.name for c in field_list if c.never_hidden],
+        #    label="Select Columns to Display:",
+        #),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages"),
+            # ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
+        ],
+        #filter_columns=[
+        #    ColumnFilter("IS_FS", type="checkbox", default=False, label="5-Few-Shot")
+        #    #ColumnFilter("FS", type="dropdown", label="5-Few-Shot")
+        #],
+        bool_checkboxgroup_label="Evaluation Mode",
+        interactive=False,
+    )
+
+def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
+    """
+    Update and return the leaderboard when a specific task is selected.
+    The table is sorted based on the "Combined Performance" field.
+    """
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    print("-----------")
+    print(dataframe)
+    print("columns : ", dataframe.columns)
+    print("-----------")
+
+    #sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)
+    sorted_dataframe = dataframe.sort_values(by="Avg. Combined Performance ⬆️", ascending=False)
+
+    #print(sorted_dataframe['Combined Performance'])
+
+    field_list = fields(AutoEvalColumn)
+
+    return Leaderboard(
+        value=sorted_dataframe,
+        datatype=[c.type for c in field_list],
+        #select_columns=SelectColumns(
+        #    default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
+        #    cant_deselect=[c.name for c in field_list if c.never_hidden],
+        #    label="Select Columns to Display:",
+        #),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages"),
+        ],
+        bool_checkboxgroup_label="Evaluation Mode",
+        interactive=False
+    )
+
+'''
+# Helper function for leaderboard initialization
+def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
+    """Initialize and return a leaderboard."""
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+'''
+
+def download_snapshot(repo, local_dir):
+    """Try to download a snapshot from Hugging Face Hub."""
+    try:
+        print(f"Downloading from {repo} to {local_dir}...")
+        snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN)
+    except Exception as e:
+        print(f"Error downloading {repo}: {e}")
+        restart_space()
+
+
+# Initialize the app by downloading snapshots
+#download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
+#download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
+
+# Load leaderboard data
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+# Prepare the main interface
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+
+        # Main leaderboard tab
+        with gr.TabItem("🏅 Benchmark"):
+
+            leaderboard = init_leaderboard(
+                LEADERBOARD_DF,
+                default_selection=['LANG', 'FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
+                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['LANG', 'FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
+            )
+
+        # About tab
+        with gr.TabItem("📝 About"):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        # Spacer tab
+        with gr.TabItem("❗", interactive=False):
+            gr.Markdown("", elem_classes="markdown-text")
+
+        # Task-specific leaderboards (multiple choice)
+        for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
+
+            with gr.TabItem(f"{metadata['icon']}{task}"):
+
+                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
+                gr.Markdown(task_description, elem_classes="markdown-text")
+
+                leaderboard = update_task_leaderboard(
+                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
+                    default_selection=['LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
+                    hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
+                )
+
+        # Spacer tab
+        with gr.TabItem("❗", interactive=False):
+            gr.Markdown("", elem_classes="markdown-text")
+
+        # Task-specific leaderboards (generative)
+        for task, metadata in TASK_METADATA_GENERATIVE.items():
+            with gr.TabItem(f"{metadata['icon']}{task}"):
+                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
+                gr.Markdown(task_description, elem_classes="markdown-text")
+
+                leaderboard = update_task_leaderboard(
+                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
+                                                   f"{task} Best Prompt": "Best Prompt",
+                                                   f"{task} Best Prompt Id": "Best Prompt Id",
+                                                   task: "Combined Performance"}),
+                    default_selection=['LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
+                                       'Best Prompt Id'],
+                    hidden_columns=[col for col in LEADERBOARD_DF.columns if
+                                    col not in ['LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average',
+                                                'Best Prompt', 'Best Prompt Id']]
+                )
+
+    # Citation section
+    with gr.Accordion("📙 Citation", open=False):
+        gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
+
+# Background job to restart space
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+
+# Launch the app with concurrent queueing
+demo.queue(default_concurrency_limit=40).launch(debug=True,  # Enable Gradio debug mode
+                                                show_error=True)
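
Side note on the task tabs above: each tab renames the task-prefixed score columns to the generic names that update_task_leaderboard expects. Below is a minimal, self-contained sketch of that rename-and-sort pattern; the model names and scores are invented for illustration, and only the column mapping mirrors the committed code.

import pandas as pd

# Hypothetical slice of the leaderboard frame (scores are made up).
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "NER": [71.2, 65.4],
    "NER Prompt Average": [68.0, 61.3],
    "NER Best Prompt": [73.5, 67.0],
    "NER Best Prompt Id": [3, 1],
})

task = "NER"
# Map the task-prefixed columns onto the generic names shared by every task tab.
renamed = df.rename(columns={
    f"{task} Prompt Average": "Prompt Average",
    f"{task} Best Prompt": "Best Prompt",
    f"{task} Best Prompt Id": "Best Prompt Id",
    task: "Combined Performance",
})
print(renamed.sort_values(by="Combined Performance", ascending=False))
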
src/envs.py
CHANGED
@@ -14,7 +14,7 @@ OWNER = "saeedfarzi"
 #QUEUE_REPO = f"{OWNER}/evalita-requests"
 #RESULTS_REPO = f"{OWNER}/evalita-results"
 
-REPO_ID = f"{OWNER}/
+REPO_ID = f"{OWNER}/MediLingua_Leaderboard"
 QUEUE_REPO = f"{OWNER}/e3c_llm_requests"
 RESULTS_REPO = f"{OWNER}/e3c_llm_results"
 
@@ -27,10 +27,18 @@ RESULTS_REPO = f"{OWNER}/e3c_llm_results"
 #EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 #EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
-EVAL_REQUESTS_PATH ='/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results' #os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results' #os.path.join(CACHE_PATH, "eval-results-bk")
+#EVAL_REQUESTS_PATH = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue")
+#EVAL_RESULTS_PATH = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results' #os.path.join(CACHE_PATH, "eval-results")
+#EVAL_REQUESTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue-bk")
+#EVAL_RESULTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results' #os.path.join(CACHE_PATH, "eval-results-bk")
 
+# Assuming app.py is in the same directory as these folders:
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Use relative paths so they work on Hugging Face as well
+EVAL_REQUESTS_PATH = os.path.join(BASE_DIR, "e3c_llm_requests")
+EVAL_RESULTS_PATH = os.path.join(BASE_DIR, "e3c_llm_results")
+EVAL_REQUESTS_PATH_BACKEND = EVAL_REQUESTS_PATH
+EVAL_RESULTS_PATH_BACKEND = EVAL_RESULTS_PATH
 
 API = HfApi(token=TOKEN)
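
Side note on the new path handling: EVAL_REQUESTS_PATH and EVAL_RESULTS_PATH are now anchored to the directory containing envs.py, so the same code runs locally and inside the Space without the hard-coded /home/sfarzi paths. A small sanity-check sketch follows; it assumes the e3c_llm_requests and e3c_llm_results folders sit next to the file, as the committed code expects.

import os

# Anchor paths to this file's own directory, mirroring src/envs.py.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
EVAL_REQUESTS_PATH = os.path.join(BASE_DIR, "e3c_llm_requests")
EVAL_RESULTS_PATH = os.path.join(BASE_DIR, "e3c_llm_results")

# Verify both folders exist before the app tries to read from them.
for path in (EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH):
    print(path, "->", "ok" if os.path.isdir(path) else "missing")
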