add open-source
Browse files- .gitignore +0 -1
- app.py +107 -109
- eval-results/.gitattributes +55 -0
- eval-results/demo-leaderboard/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/demo-leaderboard/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/demo-leaderboard/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/demo-leaderboard/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/demo-leaderboard/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/demo-leaderboard/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/demo-leaderboard/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/demo-leaderboard/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/demo-leaderboard/qwen2-72b_bge-large-zh/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/demo-leaderboard/qwen2-72b_bge-m3/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/demo-leaderboard/qwen2-72b_e5-mistral-7b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/demo-leaderboard/qwen2-72b_gte-qwen2-1.5b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/demo-leaderboard/qwen2-72b_jina-zh/results_2023-12-08 15:46:20.425378.json +35 -0
- src/about.py +21 -3
- src/display/formatting.py +3 -1
- src/display/utils.py +33 -32
- src/leaderboard/read_evals.py +66 -43
- src/populate.py +1 -0
.gitignore
CHANGED
@@ -7,7 +7,6 @@ __pycache__/
|
|
7 |
.vscode/
|
8 |
|
9 |
eval-queue/
|
10 |
-
eval-results/
|
11 |
eval-queue-bk/
|
12 |
eval-results-bk/
|
13 |
logs/
|
|
|
7 |
.vscode/
|
8 |
|
9 |
eval-queue/
|
|
|
10 |
eval-queue-bk/
|
11 |
eval-results-bk/
|
12 |
logs/
|
app.py
CHANGED
@@ -35,27 +35,27 @@ def restart_space():
|
|
35 |
### Space initialisation
|
36 |
try:
|
37 |
print(EVAL_REQUESTS_PATH)
|
38 |
-
snapshot_download(
|
39 |
-
|
40 |
-
)
|
41 |
except Exception:
|
42 |
restart_space()
|
43 |
try:
|
44 |
print(EVAL_RESULTS_PATH)
|
45 |
-
snapshot_download(
|
46 |
-
|
47 |
-
)
|
48 |
except Exception:
|
49 |
restart_space()
|
50 |
|
51 |
|
52 |
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
53 |
|
54 |
-
(
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
59 |
|
60 |
def init_leaderboard(dataframe):
|
61 |
if dataframe is None or dataframe.empty:
|
@@ -68,21 +68,21 @@ def init_leaderboard(dataframe):
|
|
68 |
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
|
69 |
label="Select Columns to Display:",
|
70 |
),
|
71 |
-
search_columns=[AutoEvalColumn.
|
72 |
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
73 |
filter_columns=[
|
74 |
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
75 |
-
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
76 |
-
ColumnFilter(
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
),
|
83 |
-
ColumnFilter(
|
84 |
-
|
85 |
-
),
|
86 |
],
|
87 |
bool_checkboxgroup_label="Hide models",
|
88 |
interactive=False,
|
@@ -101,92 +101,90 @@ with demo:
|
|
101 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
102 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
103 |
|
104 |
-
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
submission_result,
|
189 |
-
)
|
190 |
|
191 |
with gr.Row():
|
192 |
with gr.Accordion("📙 Citation", open=False):
|
|
|
35 |
### Space initialisation
|
36 |
try:
|
37 |
print(EVAL_REQUESTS_PATH)
|
38 |
+
# snapshot_download(
|
39 |
+
# repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
40 |
+
# )
|
41 |
except Exception:
|
42 |
restart_space()
|
43 |
try:
|
44 |
print(EVAL_RESULTS_PATH)
|
45 |
+
# snapshot_download(
|
46 |
+
# repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
47 |
+
# )
|
48 |
except Exception:
|
49 |
restart_space()
|
50 |
|
51 |
|
52 |
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
53 |
|
54 |
+
# (
|
55 |
+
# finished_eval_queue_df,
|
56 |
+
# running_eval_queue_df,
|
57 |
+
# pending_eval_queue_df,
|
58 |
+
# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
59 |
|
60 |
def init_leaderboard(dataframe):
|
61 |
if dataframe is None or dataframe.empty:
|
|
|
68 |
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
|
69 |
label="Select Columns to Display:",
|
70 |
),
|
71 |
+
search_columns=[AutoEvalColumn.generation_model.name, AutoEvalColumn.retrieval_model.name],
|
72 |
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
73 |
filter_columns=[
|
74 |
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
75 |
+
# ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
76 |
+
# ColumnFilter(
|
77 |
+
# AutoEvalColumn.params.name,
|
78 |
+
# type="slider",
|
79 |
+
# min=0.01,
|
80 |
+
# max=150,
|
81 |
+
# label="Select the number of parameters (B)",
|
82 |
+
# ),
|
83 |
+
# ColumnFilter(
|
84 |
+
# AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
|
85 |
+
# ),
|
86 |
],
|
87 |
bool_checkboxgroup_label="Hide models",
|
88 |
interactive=False,
|
|
|
101 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
102 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
103 |
|
104 |
+
# with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
105 |
+
# with gr.Column():
|
106 |
+
# with gr.Row():
|
107 |
+
# gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
108 |
+
#
|
109 |
+
# with gr.Column():
|
110 |
+
# with gr.Accordion(
|
111 |
+
# f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
112 |
+
# open=False,
|
113 |
+
# ):
|
114 |
+
# with gr.Row():
|
115 |
+
# finished_eval_table = gr.components.Dataframe(
|
116 |
+
# value=finished_eval_queue_df,
|
117 |
+
# headers=EVAL_COLS,
|
118 |
+
# datatype=EVAL_TYPES,
|
119 |
+
# row_count=5,
|
120 |
+
# )
|
121 |
+
# with gr.Accordion(
|
122 |
+
# f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
123 |
+
# open=False,
|
124 |
+
# ):
|
125 |
+
# with gr.Row():
|
126 |
+
# running_eval_table = gr.components.Dataframe(
|
127 |
+
# value=running_eval_queue_df,
|
128 |
+
# headers=EVAL_COLS,
|
129 |
+
# datatype=EVAL_TYPES,
|
130 |
+
# row_count=5,
|
131 |
+
# )
|
132 |
+
#
|
133 |
+
# with gr.Accordion(
|
134 |
+
# f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
135 |
+
# open=False,
|
136 |
+
# ):
|
137 |
+
# with gr.Row():
|
138 |
+
# pending_eval_table = gr.components.Dataframe(
|
139 |
+
# value=pending_eval_queue_df,
|
140 |
+
# headers=EVAL_COLS,
|
141 |
+
# datatype=EVAL_TYPES,
|
142 |
+
# row_count=5,
|
143 |
+
# )
|
144 |
+
# with gr.Row():
|
145 |
+
# gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
146 |
+
#
|
147 |
+
# with gr.Row():
|
148 |
+
# with gr.Column():
|
149 |
+
# model_name_textbox = gr.Textbox(label="Model name")
|
150 |
+
# revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
151 |
+
# model_type = gr.Dropdown(
|
152 |
+
# choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
153 |
+
# label="Model type",
|
154 |
+
# multiselect=False,
|
155 |
+
# value=None,
|
156 |
+
# interactive=True,
|
157 |
+
# )
|
158 |
+
#
|
159 |
+
# with gr.Column():
|
160 |
+
# precision = gr.Dropdown(
|
161 |
+
# choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
162 |
+
# label="Precision",
|
163 |
+
# multiselect=False,
|
164 |
+
# value="float16",
|
165 |
+
# interactive=True,
|
166 |
+
# )
|
167 |
+
# weight_type = gr.Dropdown(
|
168 |
+
# choices=[i.value.name for i in WeightType],
|
169 |
+
# label="Weights type",
|
170 |
+
# multiselect=False,
|
171 |
+
# value="Original",
|
172 |
+
# interactive=True,
|
173 |
+
# )
|
174 |
+
# base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
175 |
+
# eval_result=gr.Textbox(label="Eval Result")
|
176 |
+
#
|
177 |
+
# submit_button = gr.Button("Submit Eval")
|
178 |
+
# submission_result = gr.Markdown()
|
179 |
+
# submit_button.click(
|
180 |
+
# add_new_eval,
|
181 |
+
# [
|
182 |
+
# model_name_textbox,
|
183 |
+
# revision_name_textbox,
|
184 |
+
# model_type,
|
185 |
+
# ],
|
186 |
+
# submission_result,
|
187 |
+
# )
|
|
|
|
|
188 |
|
189 |
with gr.Row():
|
190 |
with gr.Accordion("📙 Citation", open=False):
|
eval-results/.gitattributes
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.lz4 filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
26 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
27 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
37 |
+
# Audio files - uncompressed
|
38 |
+
*.pcm filter=lfs diff=lfs merge=lfs -text
|
39 |
+
*.sam filter=lfs diff=lfs merge=lfs -text
|
40 |
+
*.raw filter=lfs diff=lfs merge=lfs -text
|
41 |
+
# Audio files - compressed
|
42 |
+
*.aac filter=lfs diff=lfs merge=lfs -text
|
43 |
+
*.flac filter=lfs diff=lfs merge=lfs -text
|
44 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
45 |
+
*.ogg filter=lfs diff=lfs merge=lfs -text
|
46 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
47 |
+
# Image files - uncompressed
|
48 |
+
*.bmp filter=lfs diff=lfs merge=lfs -text
|
49 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
50 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
51 |
+
*.tiff filter=lfs diff=lfs merge=lfs -text
|
52 |
+
# Image files - compressed
|
53 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
54 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
55 |
+
*.webp filter=lfs diff=lfs merge=lfs -text
|
eval-results/demo-leaderboard/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.0,
|
5 |
+
"map": 0.0
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.002277904328018223,
|
9 |
+
"f1": 0.43525331778147214,
|
10 |
+
"rouge1": 0.3150681120081669,
|
11 |
+
"rouge2": 0.12933954114035873,
|
12 |
+
"rougeL": 0.22495384062408755,
|
13 |
+
"accuracy": 0.33058086560364464,
|
14 |
+
"completeness": 0.5540647198105761,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.0,
|
17 |
+
"numerical_accuracy": 0.11534391534391535
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "CLOSE_deepseek-v2-chat",
|
22 |
+
"generation_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
25 |
+
"num_params": 80,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "CLOSE",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"num_params": 0.0,
|
31 |
+
"open_source": false
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
eval-results/demo-leaderboard/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.0,
|
5 |
+
"map": 0.0
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0005694760820045558,
|
9 |
+
"f1": 0.418257314239393,
|
10 |
+
"rouge1": 0.3061411048446855,
|
11 |
+
"rouge2": 0.12053616693649026,
|
12 |
+
"rougeL": 0.21948810430155005,
|
13 |
+
"accuracy": 0.285876993166287,
|
14 |
+
"completeness": 0.5132605304212169,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.0,
|
17 |
+
"numerical_accuracy": 0.06589958158995816
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "CLOSE_llama3-70b-instruct",
|
22 |
+
"generation_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
25 |
+
"num_params": 70,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "CLOSE",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"num_params": 0.0,
|
31 |
+
"open_source": false
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
eval-results/demo-leaderboard/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.0,
|
5 |
+
"map": 0.0
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0005694760820045558,
|
9 |
+
"f1": 0.4191216882279184,
|
10 |
+
"rouge1": 0.2989940495432677,
|
11 |
+
"rouge2": 0.12047626678426614,
|
12 |
+
"rougeL": 0.2082230205185154,
|
13 |
+
"accuracy": 0.34054669703872437,
|
14 |
+
"completeness": 0.5753690753690753,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.0,
|
17 |
+
"numerical_accuracy": 0.12406417112299466
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "CLOSE_qwen2-72b",
|
22 |
+
"generation_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
25 |
+
"num_params": 72,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "CLOSE",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"num_params": 0.0,
|
31 |
+
"open_source": false
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
eval-results/demo-leaderboard/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.0,
|
5 |
+
"map": 0.0
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0,
|
9 |
+
"f1": 0.11613328937628616,
|
10 |
+
"rouge1": 0.15613267640197273,
|
11 |
+
"rouge2": 0.04591153663411247,
|
12 |
+
"rougeL": 0.0496843687172552,
|
13 |
+
"accuracy": 0.14607061503416857,
|
14 |
+
"completeness": 0.4987157534246575,
|
15 |
+
"hallucination": 0.0,
|
16 |
+
"utilization": 0.0,
|
17 |
+
"numerical_accuracy": 0.0748663101604278
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "CLOSE_yi15-34b",
|
22 |
+
"generation_model": "01ai/Yi-1.5-34B-Chat-16K",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "01ai/Yi-1.5-34B-Chat-16K",
|
25 |
+
"num_params": 34,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "CLOSE",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"num_params": 0.0,
|
31 |
+
"open_source": false
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
eval-results/demo-leaderboard/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.36173120728929387,
|
5 |
+
"map": 0.3512338648443432
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0056947608200455585,
|
9 |
+
"f1": 0.4212862409737785,
|
10 |
+
"rouge1": 0.3707328288930376,
|
11 |
+
"rouge2": 0.21393113234607009,
|
12 |
+
"rougeL": 0.2719847145278759,
|
13 |
+
"accuracy": 0.3886674259681093,
|
14 |
+
"completeness": 0.5858823529411765,
|
15 |
+
"hallucination": 0.07893209518282066,
|
16 |
+
"utilization": 0.48166472642607683,
|
17 |
+
"numerical_accuracy": 0.27365491651205937
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "gte-qwen2-1.5b_deepseek-v2-chat",
|
22 |
+
"generation_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "deepseek-ai/DeepSeek-V2-Chat-0628",
|
25 |
+
"num_params": 80,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
31 |
+
"num_params": 1.5,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/demo-leaderboard/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.36173120728929387,
|
5 |
+
"map": 0.3512338648443432
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.04555808656036447,
|
9 |
+
"f1": 0.4907954247383474,
|
10 |
+
"rouge1": 0.4080491070348775,
|
11 |
+
"rouge2": 0.23130474174425783,
|
12 |
+
"rougeL": 0.3217574785678875,
|
13 |
+
"accuracy": 0.4216970387243736,
|
14 |
+
"completeness": 0.5688146380270486,
|
15 |
+
"hallucination": 0.11832946635730858,
|
16 |
+
"utilization": 0.4491869918699187,
|
17 |
+
"numerical_accuracy": 0.288981288981289
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "gte-qwen2-1.5b_llama3-70b-instruct",
|
22 |
+
"generation_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
25 |
+
"num_params": 70,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
31 |
+
"num_params": 1.5,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/demo-leaderboard/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.36173120728929387,
|
5 |
+
"map": 0.3512338648443432
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.002277904328018223,
|
9 |
+
"f1": 0.3804001391052641,
|
10 |
+
"rouge1": 0.34576336184459094,
|
11 |
+
"rouge2": 0.1928778762677512,
|
12 |
+
"rougeL": 0.2383694455084706,
|
13 |
+
"accuracy": 0.4145785876993166,
|
14 |
+
"completeness": 0.598297213622291,
|
15 |
+
"hallucination": 0.07213496218731821,
|
16 |
+
"utilization": 1.13922942206655,
|
17 |
+
"numerical_accuracy": 0.3218694885361552
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "gte-qwen2-1.5b_qwen2-72b",
|
22 |
+
"generation_model": "Qwen/Qwen2.5-72B-Instruct",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "Qwen/Qwen2.5-72B-Instruct",
|
25 |
+
"num_params": 72,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
31 |
+
"num_params": 1.5,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/demo-leaderboard/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.36173120728929387,
|
5 |
+
"map": 0.3512338648443432
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0,
|
9 |
+
"f1": 0.16041349053275844,
|
10 |
+
"rouge1": 0.21775697114621573,
|
11 |
+
"rouge2": 0.09738983880706074,
|
12 |
+
"rougeL": 0.08775246194460379,
|
13 |
+
"accuracy": 0.3211845102505695,
|
14 |
+
"completeness": 0.5703789636504254,
|
15 |
+
"hallucination": 0.07665094339622641,
|
16 |
+
"utilization": 0.40828402366863903,
|
17 |
+
"numerical_accuracy": 0.162
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "gte-qwen2-1.5b_yi15-34b",
|
22 |
+
"generation_model": "01ai/Yi-1.5-34B-Chat-16K",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "01ai/Yi-1.5-34B-Chat-16K",
|
25 |
+
"num_params": 34,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
31 |
+
"num_params": 1.5,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/demo-leaderboard/qwen2-72b_bge-large-zh/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.3426063022019742,
|
5 |
+
"map": 0.33500379650721335
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0017084282460136675,
|
9 |
+
"f1": 0.3797528411547138,
|
10 |
+
"rouge1": 0.3372893350582966,
|
11 |
+
"rouge2": 0.18329984910669803,
|
12 |
+
"rougeL": 0.23230144566069125,
|
13 |
+
"accuracy": 0.40888382687927105,
|
14 |
+
"completeness": 0.6021044427123928,
|
15 |
+
"hallucination": 0.0023391812865497076,
|
16 |
+
"utilization": 0.5014637002341921,
|
17 |
+
"numerical_accuracy": 0.3100358422939068
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "bge-large-zh_bge-large-zh",
|
22 |
+
"generation_model": "BAAI/bge-large-zh",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "BAAI/bge-large-zh",
|
25 |
+
"num_params": 0.2,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "BAAI/bge-large-zh",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "BAAI/bge-large-zh",
|
31 |
+
"num_params": 0.2,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/demo-leaderboard/qwen2-72b_bge-m3/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.3527809415337889,
|
5 |
+
"map": 0.3458855353075171
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.0017084282460136675,
|
9 |
+
"f1": 0.38645032979631466,
|
10 |
+
"rouge1": 0.3467267951634575,
|
11 |
+
"rouge2": 0.1930581604826183,
|
12 |
+
"rougeL": 0.24141093461883717,
|
13 |
+
"accuracy": 0.4271070615034169,
|
14 |
+
"completeness": 0.6119287374128582,
|
15 |
+
"hallucination": 0.0005847953216374269,
|
16 |
+
"utilization": 0.5400116822429907,
|
17 |
+
"numerical_accuracy": 0.3372093023255814
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "bge-m3_bge-m3",
|
22 |
+
"generation_model": "BAAI/bge-m3",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "BAAI/bge-m3",
|
25 |
+
"num_params": 0.2,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "BAAI/bge-m3",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "BAAI/bge-m3",
|
31 |
+
"num_params": 0.2,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/demo-leaderboard/qwen2-72b_e5-mistral-7b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.303246013667426,
|
5 |
+
"map": 0.2960516324981017
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.002277904328018223,
|
9 |
+
"f1": 0.3705164550873997,
|
10 |
+
"rouge1": 0.3270311806826159,
|
11 |
+
"rouge2": 0.17476659877087528,
|
12 |
+
"rougeL": 0.22225645997479143,
|
13 |
+
"accuracy": 0.385250569476082,
|
14 |
+
"completeness": 0.5877535101404057,
|
15 |
+
"hallucination": 1.2922719349215572,
|
16 |
+
"utilization": 0.4793244030285381,
|
17 |
+
"numerical_accuracy": 0.28622540250447226
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "e5-mistral-7b_e5-mistral-7b",
|
22 |
+
"generation_model": "intfloat/e5-mistral-7b-instruct",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "intfloat/e5-mistral-7b-instruct",
|
25 |
+
"num_params": 7,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "intfloat/e5-mistral-7b-instruct",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "intfloat/e5-mistral-7b-instruct",
|
31 |
+
"num_params": 7,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/demo-leaderboard/qwen2-72b_gte-qwen2-1.5b/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.36173120728929387,
|
5 |
+
"map": 0.3512338648443432
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.002277904328018223,
|
9 |
+
"f1": 0.3804001391052641,
|
10 |
+
"rouge1": 0.34576336184459094,
|
11 |
+
"rouge2": 0.1928778762677512,
|
12 |
+
"rougeL": 0.2383694455084706,
|
13 |
+
"accuracy": 0.4145785876993166,
|
14 |
+
"completeness": 0.598297213622291,
|
15 |
+
"hallucination": 0.0011627906976744186,
|
16 |
+
"utilization": 1.13922942206655,
|
17 |
+
"numerical_accuracy": 0.3218694885361552
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "gte-qwen2-1.5b_gte-qwen2-1.5b",
|
22 |
+
"generation_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
25 |
+
"num_params": 1.5,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
31 |
+
"num_params": 1.5,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
eval-results/demo-leaderboard/qwen2-72b_jina-zh/results_2023-12-08 15:46:20.425378.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"retrieval": {
|
4 |
+
"mrr": 0.27484813971146543,
|
5 |
+
"map": 0.26924354593773725
|
6 |
+
},
|
7 |
+
"generation": {
|
8 |
+
"em": 0.003416856492027335,
|
9 |
+
"f1": 0.37960439080933656,
|
10 |
+
"rouge1": 0.3255380867320351,
|
11 |
+
"rouge2": 0.1732248556904568,
|
12 |
+
"rougeL": 0.22591939162851002,
|
13 |
+
"accuracy": 0.3826879271070615,
|
14 |
+
"completeness": 0.5793588741204065,
|
15 |
+
"hallucination": 0.0017381228273464658,
|
16 |
+
"utilization": 0.4855072463768116,
|
17 |
+
"numerical_accuracy": 0.2663594470046083
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"config": {
|
21 |
+
"eval_name": "jina-zh_jina-zh",
|
22 |
+
"generation_model": "jinaai/reader-lm-0.5b",
|
23 |
+
"generation_model_args": {
|
24 |
+
"name": "jinaai/reader-lm-0.5b",
|
25 |
+
"num_params": 0.2,
|
26 |
+
"open_source": true
|
27 |
+
},
|
28 |
+
"retrieval_model": "jinaai/reader-lm-0.5b",
|
29 |
+
"retrieval_model_args": {
|
30 |
+
"name": "jinaai/reader-lm-0.5b",
|
31 |
+
"num_params": 0.2,
|
32 |
+
"open_source": true
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
src/about.py
CHANGED
@@ -12,8 +12,26 @@ class Task:
|
|
12 |
# ---------------------------------------------------
|
13 |
class Tasks(Enum):
|
14 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
15 |
-
task0 = Task("anli_r1", "acc", "ANLI")
|
16 |
-
task1 = Task("logiqa", "acc_norm", "LogiQA")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
NUM_FEWSHOT = 0 # Change with your few shot
|
19 |
# ---------------------------------------------------
|
@@ -21,7 +39,7 @@ NUM_FEWSHOT = 0 # Change with your few shot
|
|
21 |
|
22 |
|
23 |
# Your leaderboard name
|
24 |
-
TITLE = """<h1 align="center" id="space-title">
|
25 |
|
26 |
# What does your leaderboard evaluate?
|
27 |
INTRODUCTION_TEXT = """
|
|
|
12 |
# ---------------------------------------------------
|
13 |
class Tasks(Enum):
|
14 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
15 |
+
# task0 = Task("anli_r1", "acc", "ANLI")
|
16 |
+
# task1 = Task("logiqa", "acc_norm", "LogiQA")
|
17 |
+
|
18 |
+
# retrieval tasks
|
19 |
+
mrr = Task("retrieval", "mrr", "MRR")
|
20 |
+
map = Task("retrieval", "map", "MAP")
|
21 |
+
|
22 |
+
# generation tasks
|
23 |
+
em = Task("generation", "em", "EM")
|
24 |
+
f1 = Task("generation", "f1", "F1")
|
25 |
+
rouge1 = Task("generation", "rouge1", "Rouge-1")
|
26 |
+
rouge2 = Task("generation", "rouge2", "Rouge-2")
|
27 |
+
rougeL = Task("generation", "rougeL", "Rouge-L")
|
28 |
+
|
29 |
+
accuracy = Task("generation", "accuracy", "ACC")
|
30 |
+
completeness = Task("generation", "completeness", "COMP")
|
31 |
+
hallucination = Task("generation", "hallucination", "HAL")
|
32 |
+
utilization = Task("generation", "utilization", "UTIL")
|
33 |
+
numerical_accuracy = Task("generation", "numerical_accuracy", "MACC")
|
34 |
+
|
35 |
|
36 |
NUM_FEWSHOT = 0 # Change with your few shot
|
37 |
# ---------------------------------------------------
|
|
|
39 |
|
40 |
|
41 |
# Your leaderboard name
|
42 |
+
TITLE = """<h1 align="center" id="space-title">Fin Benchmark leaderboard</h1>"""
|
43 |
|
44 |
# What does your leaderboard evaluate?
|
45 |
INTRODUCTION_TEXT = """
|
src/display/formatting.py
CHANGED
@@ -2,7 +2,9 @@ def model_hyperlink(link, model_name):
|
|
2 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
3 |
|
4 |
|
5 |
-
def make_clickable_model(model_name):
|
|
|
|
|
6 |
link = f"https://huggingface.co/{model_name}"
|
7 |
return model_hyperlink(link, model_name)
|
8 |
|
|
|
2 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
3 |
|
4 |
|
5 |
+
def make_clickable_model(model_name, model_link=None):
|
6 |
+
if model_link:
|
7 |
+
return model_hyperlink(model_link, model_name)
|
8 |
link = f"https://huggingface.co/{model_name}"
|
9 |
return model_hyperlink(link, model_name)
|
10 |
|
src/display/utils.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1 |
from dataclasses import dataclass, make_dataclass
|
2 |
from enum import Enum
|
3 |
|
4 |
-
import pandas as pd
|
5 |
-
|
6 |
from src.about import Tasks
|
7 |
|
8 |
def fields(raw_class):
|
@@ -23,35 +21,42 @@ class ColumnContent:
|
|
23 |
## Leaderboard columns
|
24 |
auto_eval_column_dict = []
|
25 |
# Init
|
26 |
-
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("
|
27 |
-
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
|
|
|
|
|
|
28 |
#Scores
|
29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
30 |
for task in Tasks:
|
31 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
|
|
32 |
# Model information
|
33 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
34 |
-
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
35 |
-
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
36 |
-
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
37 |
-
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
38 |
-
auto_eval_column_dict.append(["
|
39 |
-
auto_eval_column_dict.append(["
|
40 |
-
auto_eval_column_dict.append(["
|
41 |
-
auto_eval_column_dict.append(["
|
|
|
42 |
|
43 |
# We use make dataclass to dynamically fill the scores from Tasks
|
44 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
|
|
45 |
|
46 |
## For the queue columns in the submission tab
|
47 |
-
@dataclass(frozen=True)
|
48 |
-
class EvalQueueColumn: # Queue column
|
49 |
-
|
50 |
-
revision = ColumnContent("revision", "str", True)
|
51 |
-
private = ColumnContent("private", "bool", True)
|
52 |
-
precision = ColumnContent("precision", "str", True)
|
53 |
-
weight_type = ColumnContent("weight_type", "str", "Original")
|
54 |
-
status = ColumnContent("status", "str", True)
|
|
|
55 |
|
56 |
## All the model information that we might need
|
57 |
@dataclass
|
@@ -62,10 +67,10 @@ class ModelDetails:
|
|
62 |
|
63 |
|
64 |
class ModelType(Enum):
|
65 |
-
|
66 |
-
FT = ModelDetails(name="fine-tuned", symbol="🔶")
|
67 |
-
|
68 |
-
RL = ModelDetails(name="RL-tuned", symbol="🟦")
|
69 |
Unknown = ModelDetails(name="", symbol="?")
|
70 |
|
71 |
def to_str(self, separator=" "):
|
@@ -73,14 +78,10 @@ class ModelType(Enum):
|
|
73 |
|
74 |
@staticmethod
|
75 |
def from_str(type):
|
76 |
-
if "
|
77 |
-
return ModelType.
|
78 |
-
if "
|
79 |
-
return ModelType.
|
80 |
-
if "RL-tuned" in type or "🟦" in type:
|
81 |
-
return ModelType.RL
|
82 |
-
if "instruction-tuned" in type or "⭕" in type:
|
83 |
-
return ModelType.IFT
|
84 |
return ModelType.Unknown
|
85 |
|
86 |
class WeightType(Enum):
|
|
|
1 |
from dataclasses import dataclass, make_dataclass
|
2 |
from enum import Enum
|
3 |
|
|
|
|
|
4 |
from src.about import Tasks
|
5 |
|
6 |
def fields(raw_class):
|
|
|
21 |
## Leaderboard columns
|
22 |
auto_eval_column_dict = []
|
23 |
# Init
|
24 |
+
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("Type Symbol", "str", True, never_hidden=True)])
|
25 |
+
# auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
26 |
+
auto_eval_column_dict.append(["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)])
|
27 |
+
auto_eval_column_dict.append(["generation_model", ColumnContent, ColumnContent("Generation Model", "markdown", True, never_hidden=True)])
|
28 |
+
|
29 |
#Scores
|
30 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
31 |
for task in Tasks:
|
32 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
33 |
+
|
34 |
# Model information
|
35 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
36 |
+
# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
37 |
+
# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
38 |
+
# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
39 |
+
# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
40 |
+
auto_eval_column_dict.append(["gen_num_params", ColumnContent, ColumnContent("Gen#Params (B)", "number", False)])
|
41 |
+
auto_eval_column_dict.append(["ret_num_params", ColumnContent, ColumnContent("Ret#Params (B)", "number", False)])
|
42 |
+
# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
43 |
+
# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
44 |
+
# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
45 |
|
46 |
# We use make dataclass to dynamically fill the scores from Tasks
|
47 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
48 |
+
EvalQueueColumn = make_dataclass("EvalQueueColumn", auto_eval_column_dict, frozen=True)
|
49 |
|
50 |
## For the queue columns in the submission tab
|
51 |
+
# @dataclass(frozen=True)
|
52 |
+
# class EvalQueueColumn: # Queue column
|
53 |
+
# model = ColumnContent("model", "markdown", True)
|
54 |
+
# revision = ColumnContent("revision", "str", True)
|
55 |
+
# private = ColumnContent("private", "bool", True)
|
56 |
+
# precision = ColumnContent("precision", "str", True)
|
57 |
+
# weight_type = ColumnContent("weight_type", "str", "Original")
|
58 |
+
# status = ColumnContent("status", "str", True)
|
59 |
+
|
60 |
|
61 |
## All the model information that we might need
|
62 |
@dataclass
|
|
|
67 |
|
68 |
|
69 |
class ModelType(Enum):
|
70 |
+
OpenSource = ModelDetails(name="open-source", symbol="🟢")
|
71 |
+
# FT = ModelDetails(name="fine-tuned", symbol="🔶")
|
72 |
+
ClosedSource = ModelDetails(name="closed-source", symbol="⭕")
|
73 |
+
# RL = ModelDetails(name="RL-tuned", symbol="🟦")
|
74 |
Unknown = ModelDetails(name="", symbol="?")
|
75 |
|
76 |
def to_str(self, separator=" "):
|
|
|
78 |
|
79 |
@staticmethod
|
80 |
def from_str(type):
|
81 |
+
if "open-source" in type or "🟢" in type:
|
82 |
+
return ModelType.OpenSource
|
83 |
+
if "closed-source" in type or "⭕" in type:
|
84 |
+
return ModelType.ClosedSource
|
|
|
|
|
|
|
|
|
85 |
return ModelType.Unknown
|
86 |
|
87 |
class WeightType(Enum):
|
src/leaderboard/read_evals.py
CHANGED
@@ -17,18 +17,21 @@ class EvalResult:
|
|
17 |
"""Represents one full evaluation. Built from a combination of the result and request file for a given run.
|
18 |
"""
|
19 |
eval_name: str # org_model_precision (uid)
|
20 |
-
full_model: str # org/model (path on hub)
|
21 |
org: str
|
22 |
-
|
23 |
-
|
|
|
24 |
results: dict
|
|
|
|
|
|
|
|
|
25 |
precision: Precision = Precision.Unknown
|
26 |
model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
|
27 |
weight_type: WeightType = WeightType.Original # Original or Adapter
|
28 |
architecture: str = "Unknown"
|
29 |
license: str = "?"
|
30 |
likes: int = 0
|
31 |
-
num_params: int = 0
|
32 |
date: str = "" # submission date of request file
|
33 |
still_on_hub: bool = False
|
34 |
|
@@ -41,30 +44,32 @@ class EvalResult:
|
|
41 |
config = data.get("config")
|
42 |
|
43 |
# Precision
|
44 |
-
precision = Precision.from_str(config.get("model_dtype"))
|
45 |
|
46 |
# Get model and org
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
68 |
|
69 |
# Extract results available in this file (some results are split in several files)
|
70 |
results = {}
|
@@ -79,16 +84,28 @@ class EvalResult:
|
|
79 |
mean_acc = np.mean(accs) * 100.0
|
80 |
results[task.benchmark] = mean_acc
|
81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
return self(
|
83 |
-
eval_name=
|
84 |
-
full_model=full_model,
|
85 |
org=org,
|
86 |
-
|
|
|
87 |
results=results,
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
92 |
)
|
93 |
|
94 |
def update_with_request_file(self, requests_path):
|
@@ -112,18 +129,21 @@ class EvalResult:
|
|
112 |
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
113 |
data_dict = {
|
114 |
"eval_name": self.eval_name, # not a column, just a save name,
|
115 |
-
AutoEvalColumn.precision.name: self.precision.value.name,
|
116 |
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
117 |
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
118 |
-
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
119 |
-
AutoEvalColumn.architecture.name: self.architecture,
|
120 |
-
AutoEvalColumn.
|
121 |
-
AutoEvalColumn.
|
|
|
122 |
AutoEvalColumn.average.name: average,
|
123 |
-
AutoEvalColumn.license.name: self.license,
|
124 |
-
AutoEvalColumn.likes.name: self.likes,
|
125 |
-
AutoEvalColumn.
|
126 |
-
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
|
|
|
|
127 |
}
|
128 |
|
129 |
for task in Tasks:
|
@@ -171,12 +191,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
171 |
|
172 |
for file in files:
|
173 |
model_result_filepaths.append(os.path.join(root, file))
|
|
|
174 |
|
175 |
eval_results = {}
|
176 |
for model_result_filepath in model_result_filepaths:
|
177 |
# Creation of result
|
178 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
179 |
-
eval_result.update_with_request_file(requests_path)
|
180 |
|
181 |
# Store results of same eval together
|
182 |
eval_name = eval_result.eval_name
|
@@ -191,6 +212,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
191 |
v.to_dict() # we test if the dict version is complete
|
192 |
results.append(v)
|
193 |
except KeyError: # not all eval values present
|
|
|
|
|
194 |
continue
|
195 |
|
196 |
return results
|
|
|
17 |
"""Represents one full evaluation. Built from a combination of the result and request file for a given run.
|
18 |
"""
|
19 |
eval_name: str # org_model_precision (uid)
|
|
|
20 |
org: str
|
21 |
+
generation_model: str
|
22 |
+
retrieval_model: str
|
23 |
+
# revision: str # commit hash, "" if main
|
24 |
results: dict
|
25 |
+
generation_model_link: str = "" # link to the model on the hub
|
26 |
+
generation_model_args: dict = None
|
27 |
+
retrieval_model_link: str = "" # link to the model on the hub
|
28 |
+
retrieval_model_args: dict = None
|
29 |
precision: Precision = Precision.Unknown
|
30 |
model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
|
31 |
weight_type: WeightType = WeightType.Original # Original or Adapter
|
32 |
architecture: str = "Unknown"
|
33 |
license: str = "?"
|
34 |
likes: int = 0
|
|
|
35 |
date: str = "" # submission date of request file
|
36 |
still_on_hub: bool = False
|
37 |
|
|
|
44 |
config = data.get("config")
|
45 |
|
46 |
# Precision
|
47 |
+
# precision = Precision.from_str(config.get("model_dtype"))
|
48 |
|
49 |
# Get model and org
|
50 |
+
eval_name= config.get("eval_name", "")
|
51 |
+
generation_model = config.get("generation_model", "")
|
52 |
+
retrieval_model = config.get("retrieval_model", "")
|
53 |
+
org= config.get("org", "")
|
54 |
+
# org_and_model = org_and_model.split("/", 1)
|
55 |
+
#
|
56 |
+
# if len(org_and_model) == 1:
|
57 |
+
# org = None
|
58 |
+
# model = org_and_model[0]
|
59 |
+
# result_key = f"{model}_{precision.value.name}"
|
60 |
+
# else:
|
61 |
+
# org = org_and_model[0]
|
62 |
+
# model = org_and_model[1]
|
63 |
+
# result_key = f"{org}_{model}_{precision.value.name}"
|
64 |
+
# full_model = "/".join(org_and_model)
|
65 |
+
|
66 |
+
# still_on_hub, _, model_config = is_model_on_hub(
|
67 |
+
# full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
|
68 |
+
# )
|
69 |
+
# if model_config is not None:
|
70 |
+
# architectures = getattr(model_config, "architectures", None)
|
71 |
+
# if architectures:
|
72 |
+
# architecture = ";".join(architectures)
|
73 |
|
74 |
# Extract results available in this file (some results are split in several files)
|
75 |
results = {}
|
|
|
84 |
mean_acc = np.mean(accs) * 100.0
|
85 |
results[task.benchmark] = mean_acc
|
86 |
|
87 |
+
generation_model_args = config.get("generation_model_args", None)
|
88 |
+
retrieval_model_args = config.get("retrieval_model_args", None)
|
89 |
+
open_source= True
|
90 |
+
if not generation_model_args or not generation_model_args.get("open_source", False):
|
91 |
+
open_source = False
|
92 |
+
if not retrieval_model_args or not retrieval_model_args.get("open_source", False):
|
93 |
+
open_source = False
|
94 |
+
|
95 |
return self(
|
96 |
+
eval_name=eval_name,
|
97 |
+
# full_model=full_model,
|
98 |
org=org,
|
99 |
+
generation_model=generation_model,
|
100 |
+
retrieval_model=retrieval_model,
|
101 |
results=results,
|
102 |
+
generation_model_args=generation_model_args,
|
103 |
+
retrieval_model_args=retrieval_model_args,
|
104 |
+
model_type=ModelType.OpenSource if open_source else ModelType.ClosedSource,
|
105 |
+
# precision=precision,
|
106 |
+
# revision= config.get("model_sha", ""),
|
107 |
+
# still_on_hub=still_on_hub,
|
108 |
+
# architecture=architecture
|
109 |
)
|
110 |
|
111 |
def update_with_request_file(self, requests_path):
|
|
|
129 |
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
130 |
data_dict = {
|
131 |
"eval_name": self.eval_name, # not a column, just a save name,
|
132 |
+
# AutoEvalColumn.precision.name: self.precision.value.name,
|
133 |
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
134 |
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
135 |
+
# AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
136 |
+
# AutoEvalColumn.architecture.name: self.architecture,
|
137 |
+
AutoEvalColumn.generation_model.name: make_clickable_model(self.generation_model, self.generation_model_link),
|
138 |
+
AutoEvalColumn.retrieval_model.name: make_clickable_model(self.retrieval_model, self.retrieval_model_link),
|
139 |
+
# AutoEvalColumn.revision.name: self.revision,
|
140 |
AutoEvalColumn.average.name: average,
|
141 |
+
# AutoEvalColumn.license.name: self.license,
|
142 |
+
# AutoEvalColumn.likes.name: self.likes,
|
143 |
+
# AutoEvalColumn.generation_model_params.name: self.num_params,
|
144 |
+
# AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
145 |
+
"Gen#Params (B)": self.generation_model_args.get("num_params", "Unknown"),
|
146 |
+
"Ret#Params (B)": self.retrieval_model_args.get("num_params", "Unknown"),
|
147 |
}
|
148 |
|
149 |
for task in Tasks:
|
|
|
191 |
|
192 |
for file in files:
|
193 |
model_result_filepaths.append(os.path.join(root, file))
|
194 |
+
print(f"Adding {file}")
|
195 |
|
196 |
eval_results = {}
|
197 |
for model_result_filepath in model_result_filepaths:
|
198 |
# Creation of result
|
199 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
200 |
+
# eval_result.update_with_request_file(requests_path)
|
201 |
|
202 |
# Store results of same eval together
|
203 |
eval_name = eval_result.eval_name
|
|
|
212 |
v.to_dict() # we test if the dict version is complete
|
213 |
results.append(v)
|
214 |
except KeyError: # not all eval values present
|
215 |
+
import traceback
|
216 |
+
traceback.print_exc()
|
217 |
continue
|
218 |
|
219 |
return results
|
src/populate.py
CHANGED
@@ -12,6 +12,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
|
|
12 |
"""Creates a dataframe from all the individual experiment results"""
|
13 |
raw_data = get_raw_eval_results(results_path, requests_path)
|
14 |
all_data_json = [v.to_dict() for v in raw_data]
|
|
|
15 |
|
16 |
df = pd.DataFrame.from_records(all_data_json)
|
17 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
|
|
12 |
"""Creates a dataframe from all the individual experiment results"""
|
13 |
raw_data = get_raw_eval_results(results_path, requests_path)
|
14 |
all_data_json = [v.to_dict() for v in raw_data]
|
15 |
+
print(all_data_json)
|
16 |
|
17 |
df = pd.DataFrame.from_records(all_data_json)
|
18 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|