alibayram committed
Commit 3b81b14 • 1 Parent(s): 2014dc9

Add Gradio interface for LLM benchmarking and evaluation submission

Files changed (2)
  1. app.py +73 -198
  2. app_ex.py +204 -0
app.py CHANGED
@@ -1,204 +1,79 @@
  import gradio as gr
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
- from apscheduler.schedulers.background import BackgroundScheduler
- from huggingface_hub import snapshot_download

- from src.about import (
-     CITATION_BUTTON_LABEL,
-     CITATION_BUTTON_TEXT,
-     EVALUATION_QUEUE_TEXT,
-     INTRODUCTION_TEXT,
-     LLM_BENCHMARKS_TEXT,
-     TITLE,
  )
- from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
  )
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval
-
-
- def restart_space():
-     API.restart_space(repo_id=REPO_ID)
-
- ### Space initialisation
- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
- try:
-     print(EVAL_RESULTS_PATH)
-     snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
-
-
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
- def init_leaderboard(dataframe):
-     if dataframe is None or dataframe.empty:
-         raise ValueError("Leaderboard DataFrame is empty or None.")
-     return Leaderboard(
-         value=dataframe,
-         datatype=[c.type for c in fields(AutoEvalColumn)],
-         select_columns=SelectColumns(
-             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-             label="Select Columns to Display:",
-         ),
-         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-         filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-             ColumnFilter(
-                 AutoEvalColumn.params.name,
-                 type="slider",
-                 min=0.01,
-                 max=150,
-                 label="Select the number of parameters (B)",
-             ),
-             ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-             ),
-         ],
-         bool_checkboxgroup_label="Hide models",
-         interactive=False,
-     )
-
-
- demo = gr.Blocks(css=custom_css)
- with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-             with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-             with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )
-
-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
-                     )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
-             )
-
-     with gr.Row():
-         with gr.Accordion("📙 Citation", open=False):
-             citation_button = gr.Textbox(
-                 value=CITATION_BUTTON_TEXT,
-                 label=CITATION_BUTTON_LABEL,
-                 lines=20,
-                 elem_id="citation-button",
-                 show_copy_button=True,
-             )

- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
- scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
  import gradio as gr
  import pandas as pd
+ import matplotlib.pyplot as plt

+ # Load datasets
+ leaderboard_data = pd.read_parquet(
+     "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_liderlik_tablosu/data/train-00000-of-00001.parquet"
  )
+ model_responses_data = pd.read_parquet(
+     "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_model_cevaplari/data/train-00000-of-00001.parquet"
+ )
+ section_results_data = pd.read_parquet(
+     "hf://datasets/alibayram/yapay_zeka_turkce_mmlu_bolum_sonuclari/data/train-00000-of-00001.parquet"
  )

+ # Leaderboard Tab
+ def get_leaderboard(sort_by="Accuracy"):
+     return leaderboard_data.sort_values(by=sort_by, ascending=False)
+
+ # Model Responses Tab
+ def search_model_responses(query, model):
+     filtered = model_responses_data[
+         (model_responses_data["model"] == model) &
+         (model_responses_data["question"].str.contains(query, case=False))
+     ]
+     return filtered
+
+ # Section Results Tab
+ def plot_section_results():
+     fig, ax = plt.subplots(figsize=(10, 6))
+     section_results_data.groupby("section")["accuracy"].mean().plot(kind="bar", ax=ax)
+     ax.set_title("Section-Wise Performance")
+     ax.set_ylabel("Accuracy (%)")
+     ax.set_xlabel("Section")
+     return fig
+
+ # Model Comparison Tab
+ def compare_models(models):
+     comparison = leaderboard_data[leaderboard_data["model"].isin(models)]
+     return comparison
+
+ # Gradio Interface
+ with gr.Blocks() as app:
+     gr.Markdown("# 🏆 Turkish MMLU Leaderboard")
+     gr.Markdown("Explore the performance of AI models on Turkish MMLU benchmarks.")
+
+     with gr.Tab("Leaderboard"):
+         sort_by = gr.Dropdown(
+             ["Accuracy", "Runtime", "Model Name"],
+             label="Sort By",
+             value="Accuracy"
+         )
+         leaderboard_table = gr.DataFrame(value=leaderboard_data)
+         sort_by.change(get_leaderboard, inputs=sort_by, outputs=leaderboard_table)
+
+     with gr.Tab("Model Responses"):
+         model_dropdown = gr.Dropdown(
+             leaderboard_data["model"].unique(), label="Select Model"
+         )
+         query_input = gr.Textbox(label="Search Query")
+         responses_output = gr.DataFrame()
+         query_input.change(
+             search_model_responses,
+             inputs=[query_input, model_dropdown],
+             outputs=responses_output,
+         )
+
+     with gr.Tab("Section Results"):
+         gr.Markdown("### Section-Wise Results")
+         gr.Plot(plot_section_results)
+
+     with gr.Tab("Model Comparison"):
+         model_select = gr.CheckboxGroup(
+             choices=leaderboard_data["model"].unique(), label="Select Models"
+         )
+         comparison_table = gr.DataFrame()
+         model_select.change(compare_models, inputs=model_select, outputs=comparison_table)
+
+ app.launch()
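
Note on the dataset loading in the new app.py: the `hf://datasets/...` paths are resolved by pandas through its fsspec support, which requires the `huggingface_hub` package to be installed in the Space. As a minimal sketch only (not part of this commit; the `load_split` helper is hypothetical), the same parquet files could instead be downloaded explicitly and read locally:

    import pandas as pd
    from huggingface_hub import hf_hub_download  # explicit download with local caching

    def load_split(repo_id: str) -> pd.DataFrame:
        # Same file layout as the hf:// paths used above.
        local_path = hf_hub_download(
            repo_id=repo_id,
            filename="data/train-00000-of-00001.parquet",
            repo_type="dataset",
        )
        return pd.read_parquet(local_path)

    leaderboard_data = load_split("alibayram/yapay_zeka_turkce_mmlu_liderlik_tablosu")

Either approach yields the same DataFrames; the explicit download just makes the dependency and caching visible.
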
app_ex.py ADDED
@@ -0,0 +1,204 @@
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+ import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import snapshot_download
+
+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     BENCHMARK_COLS,
+     COLS,
+     EVAL_COLS,
+     EVAL_TYPES,
+     AutoEvalColumn,
+     ModelType,
+     fields,
+     WeightType,
+     Precision
+ )
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.submission.submit import add_new_eval
+
+
+ def restart_space():
+     API.restart_space(repo_id=REPO_ID)
+
+ ### Space initialisation
+ try:
+     print(EVAL_REQUESTS_PATH)
+     snapshot_download(
+         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception:
+     restart_space()
+ try:
+     print(EVAL_RESULTS_PATH)
+     snapshot_download(
+         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception:
+     restart_space()
+
+
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+
+ (
+     finished_eval_queue_df,
+     running_eval_queue_df,
+     pending_eval_queue_df,
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+ def init_leaderboard(dataframe):
+     if dataframe is None or dataframe.empty:
+         raise ValueError("Leaderboard DataFrame is empty or None.")
+     return Leaderboard(
+         value=dataframe,
+         datatype=[c.type for c in fields(AutoEvalColumn)],
+         select_columns=SelectColumns(
+             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+             label="Select Columns to Display:",
+         ),
+         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+         filter_columns=[
+             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+             ColumnFilter(
+                 AutoEvalColumn.params.name,
+                 type="slider",
+                 min=0.01,
+                 max=150,
+                 label="Select the number of parameters (B)",
+             ),
+             ColumnFilter(
+                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+             ),
+         ],
+         bool_checkboxgroup_label="Hide models",
+         interactive=False,
+     )
+
+
+ demo = gr.Blocks(css=custom_css)
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+             leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+             with gr.Column():
+                 with gr.Row():
+                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                 with gr.Column():
+                     with gr.Accordion(
+                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             finished_eval_table = gr.components.Dataframe(
+                                 value=finished_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+                     with gr.Accordion(
+                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             running_eval_table = gr.components.Dataframe(
+                                 value=running_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+
+                     with gr.Accordion(
+                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             pending_eval_table = gr.components.Dataframe(
+                                 value=pending_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(label="Model name")
+                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                     model_type = gr.Dropdown(
+                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                         label="Model type",
+                         multiselect=False,
+                         value=None,
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     precision = gr.Dropdown(
+                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                         label="Precision",
+                         multiselect=False,
+                         value="float16",
+                         interactive=True,
+                     )
+                     weight_type = gr.Dropdown(
+                         choices=[i.value.name for i in WeightType],
+                         label="Weights type",
+                         multiselect=False,
+                         value="Original",
+                         interactive=True,
+                     )
+                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+             submit_button = gr.Button("Submit Eval")
+             submission_result = gr.Markdown()
+             submit_button.click(
+                 add_new_eval,
+                 [
+                     model_name_textbox,
+                     base_model_name_textbox,
+                     revision_name_textbox,
+                     precision,
+                     weight_type,
+                     model_type,
+                 ],
+                 submission_result,
+             )
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 lines=20,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=1800)
+ scheduler.start()
+ demo.queue(default_concurrency_limit=40).launch()