Sfarzi committed on
Commit
318d554
·
1 Parent(s): b9301cc

Add my custom leaderboard files

Files changed (2)
  1. .ipynb_checkpoints/app-checkpoint.py +224 -0
  2. src/envs.py +13 -5
.ipynb_checkpoints/app-checkpoint.py ADDED
@@ -0,0 +1,224 @@
+import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
+from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
+from src.display.css_html_js import custom_css
+from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+import random
+
+# Define task metadata (icons, names, descriptions)
+TASK_METADATA_MULTIPLECHOICE = {
+    # "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
+    # "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
+    # "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
+    # "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""},
+    # "WIC": {"icon": "🔀", "name": "Word in Context", "tooltip": ""},
+    # "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""}
+}
+
+# Define task metadata (icons, names, descriptions)
+TASK_METADATA_GENERATIVE = {
+    # "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
+    # "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
+    "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
+    "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
+}
+
+def restart_space():
+    """Restart the Hugging Face Space."""
+    API.restart_space(repo_id=REPO_ID)
+
+
+def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
+    """
+    Initialize and return the leaderboard when it is first loaded or when 'benchmark' is selected.
+    The table is sorted based on the "Avg. Combined Performance" field.
+    """
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    field_list = fields(AutoEvalColumn)
+
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in field_list],
+        #select_columns=SelectColumns(
+        #    default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
+        #    cant_deselect=[c.name for c in field_list if c.never_hidden],
+        #    label="Select Columns to Display:",
+        #),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages"),
+            # ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
+        ],
+        #filter_columns=[
+        #    ColumnFilter("IS_FS", type="checkbox", default=False, label="5-Few-Shot")
+        #    #ColumnFilter("FS", type="dropdown", label="5-Few-Shot")
+        #],
+        bool_checkboxgroup_label="Evaluation Mode",
+        interactive=False,
+    )
+
+def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
+    """
+    Update and return the leaderboard when a specific task is selected.
+    The table is sorted based on the "Combined Performance" field.
+    """
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    print("-----------")
+    print(dataframe)
+    print("columns : ", dataframe.columns)
+    print("-----------")
+
+    #sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)
+    sorted_dataframe = dataframe.sort_values(by="Avg. Combined Performance ⬆️", ascending=False)
+
+    #print(sorted_dataframe['Combined Performance'])
+
+    field_list = fields(AutoEvalColumn)
+
+    return Leaderboard(
+        value=sorted_dataframe,
+        datatype=[c.type for c in field_list],
+        #select_columns=SelectColumns(
+        #    default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
+        #    cant_deselect=[c.name for c in field_list if c.never_hidden],
+        #    label="Select Columns to Display:",
+        #),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages"),
+        ],
+        bool_checkboxgroup_label="Evaluation Mode",
+        interactive=False
+    )
+
+'''
+# Helper function for leaderboard initialization
+def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
+    """Initialize and return a leaderboard."""
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+'''
+
+def download_snapshot(repo, local_dir):
+    """Try to download a snapshot from Hugging Face Hub."""
+    try:
+        print(f"Downloading from {repo} to {local_dir}...")
+        snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN)
+    except Exception as e:
+        print(f"Error downloading {repo}: {e}")
+        restart_space()
+
+
+# Initialize the app by downloading snapshots
+#download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
+#download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
+
+# Load leaderboard data
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+# Prepare the main interface
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+
+        # Main leaderboard tab
+        with gr.TabItem("🏅 Benchmark"):
+
+            leaderboard = init_leaderboard(
+                LEADERBOARD_DF,
+                default_selection=['LANG','FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
+                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['LANG','FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
+            )
+
+        # About tab
+        with gr.TabItem("📝 About"):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        # Divider tab (visual separator)
+        with gr.TabItem("║", interactive=False):
+            gr.Markdown("", elem_classes="markdown-text")
+
+        # Task-specific leaderboards (multiple-choice tasks)
+        for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
+
+            with gr.TabItem(f"{metadata['icon']}{task}"):
+
+                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
+                gr.Markdown(task_description, elem_classes="markdown-text")
+
+                leaderboard = update_task_leaderboard(
+                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
+                    default_selection=['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
+                    hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
+                )
+
+        # Divider tab (visual separator)
+        with gr.TabItem("│", interactive=False):
+            gr.Markdown("", elem_classes="markdown-text")
+
+        # Task-specific leaderboards (generative tasks)
+        for task, metadata in TASK_METADATA_GENERATIVE.items():
+            with gr.TabItem(f"{metadata['icon']}{task}"):
+                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
+                gr.Markdown(task_description, elem_classes="markdown-text")
+
+                leaderboard = update_task_leaderboard(
+                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
+                                                   f"{task} Best Prompt": "Best Prompt",
+                                                   f"{task} Best Prompt Id": "Best Prompt Id",
+                                                   task: "Combined Performance"}),
+                    default_selection=['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
+                                       'Best Prompt Id'],
+                    hidden_columns=[col for col in LEADERBOARD_DF.columns if
+                                    col not in ['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average',
+                                                'Best Prompt', 'Best Prompt Id']]
+                )
+
+    # Citation section
+    with gr.Accordion("📙 Citation", open=False):
+        gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
+
+# Background job to restart space
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+
+# Launch the app with concurrent queueing
+demo.queue(default_concurrency_limit=40).launch(debug=True,  # Enable Gradio debug mode
+                                                show_error=True)
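
A minimal sketch of the renaming step each task tab applies to the shared LEADERBOARD_DF before handing it to update_task_leaderboard. The toy table and its values below are invented for illustration; only pandas is needed to follow the logic.

# Illustrative only: per-task columns are renamed to the generic names the
# leaderboard component expects, then the table is sorted on the average column.
import pandas as pd

toy_df = pd.DataFrame({
    "Model": ["model-a", "model-b"],                 # invented rows
    "Avg. Combined Performance ⬆️": [71.2, 68.9],
    "NER": [74.0, 70.5],                             # combined score for the NER task
    "NER Prompt Average": [69.3, 66.1],
    "NER Best Prompt": [76.8, 72.4],
    "NER Best Prompt Id": [3, 1],
})

task = "NER"
task_view = toy_df.rename(columns={
    f"{task} Prompt Average": "Prompt Average",
    f"{task} Best Prompt": "Best Prompt",
    f"{task} Best Prompt Id": "Best Prompt Id",
    task: "Combined Performance",
})
# update_task_leaderboard() sorts on the average column before display
print(task_view.sort_values(by="Avg. Combined Performance ⬆️", ascending=False))
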
src/envs.py CHANGED
@@ -14,7 +14,7 @@ OWNER = "saeedfarzi"
 #QUEUE_REPO = f"{OWNER}/evalita-requests"
 #RESULTS_REPO = f"{OWNER}/evalita-results"
 
-REPO_ID = f"{OWNER}/llm_leaderboard"
+REPO_ID = f"{OWNER}/MediLingua_Leaderboard"
 QUEUE_REPO = f"{OWNER}/e3c_llm_requests"
 RESULTS_REPO = f"{OWNER}/e3c_llm_results"
 
@@ -27,10 +27,18 @@ RESULTS_REPO = f"{OWNER}/e3c_llm_results"
 #EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 #EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
-EVAL_REQUESTS_PATH ='/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results'#os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results' #os.path.join(CACHE_PATH, "eval-results-bk")
+#EVAL_REQUESTS_PATH ='/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue")
+#EVAL_RESULTS_PATH = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results'#os.path.join(CACHE_PATH, "eval-results")
+#EVAL_REQUESTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue-bk")
+#EVAL_RESULTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results' #os.path.join(CACHE_PATH, "eval-results-bk")
 
+# Assuming app.py is in the same directory as these folders:
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Use relative paths so they work on Hugging Face as well
+EVAL_REQUESTS_PATH = os.path.join(BASE_DIR, "e3c_llm_requests")
+EVAL_RESULTS_PATH = os.path.join(BASE_DIR, "e3c_llm_results")
+EVAL_REQUESTS_PATH_BACKEND = EVAL_REQUESTS_PATH
+EVAL_RESULTS_PATH_BACKEND = EVAL_RESULTS_PATH
 
 API = HfApi(token=TOKEN)
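
A minimal sketch of how the new __file__-relative paths in src/envs.py resolve, using a hypothetical checkout path (illustrative only). Since envs.py sits under src/, BASE_DIR points at the src/ directory, so the e3c_llm_requests and e3c_llm_results folders are looked up inside src/ rather than next to app.py:

# Illustrative only: reproduces the path arithmetic with a hypothetical location.
import os

envs_file = "/home/user/llm_leaderboard/src/envs.py"   # hypothetical checkout path
BASE_DIR = os.path.dirname(os.path.abspath(envs_file))
EVAL_REQUESTS_PATH = os.path.join(BASE_DIR, "e3c_llm_requests")
EVAL_RESULTS_PATH = os.path.join(BASE_DIR, "e3c_llm_results")

print(BASE_DIR)              # /home/user/llm_leaderboard/src
print(EVAL_REQUESTS_PATH)    # /home/user/llm_leaderboard/src/e3c_llm_requests
print(EVAL_RESULTS_PATH)     # /home/user/llm_leaderboard/src/e3c_llm_results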