zstanjj committed

Commit 67ce912
1 Parent(s): c881658

add open-source

Files changed (21)
  1. .gitignore +0 -1
  2. app.py +107 -109
  3. eval-results/.gitattributes +55 -0
  4. eval-results/demo-leaderboard/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +34 -0
  5. eval-results/demo-leaderboard/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +34 -0
  6. eval-results/demo-leaderboard/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json +34 -0
  7. eval-results/demo-leaderboard/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json +34 -0
  8. eval-results/demo-leaderboard/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +35 -0
  9. eval-results/demo-leaderboard/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +35 -0
  10. eval-results/demo-leaderboard/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
  11. eval-results/demo-leaderboard/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json +35 -0
  12. eval-results/demo-leaderboard/qwen2-72b_bge-large-zh/results_2023-12-08 15:46:20.425378.json +35 -0
  13. eval-results/demo-leaderboard/qwen2-72b_bge-m3/results_2023-12-08 15:46:20.425378.json +35 -0
  14. eval-results/demo-leaderboard/qwen2-72b_e5-mistral-7b/results_2023-12-08 15:46:20.425378.json +35 -0
  15. eval-results/demo-leaderboard/qwen2-72b_gte-qwen2-1.5b/results_2023-12-08 15:46:20.425378.json +35 -0
  16. eval-results/demo-leaderboard/qwen2-72b_jina-zh/results_2023-12-08 15:46:20.425378.json +35 -0
  17. src/about.py +21 -3
  18. src/display/formatting.py +3 -1
  19. src/display/utils.py +33 -32
  20. src/leaderboard/read_evals.py +66 -43
  21. src/populate.py +1 -0
.gitignore CHANGED
@@ -7,7 +7,6 @@ __pycache__/
 .vscode/
 
 eval-queue/
-eval-results/
 eval-queue-bk/
 eval-results-bk/
 logs/
app.py CHANGED
@@ -35,27 +35,27 @@ def restart_space():
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
+    # snapshot_download(
+    #     repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    # )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
+    # snapshot_download(
+    #     repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    # )
 except Exception:
     restart_space()
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+# (
+#     finished_eval_queue_df,
+#     running_eval_queue_df,
+#     pending_eval_queue_df,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -68,21 +68,21 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        search_columns=[AutoEvalColumn.generation_model.name, AutoEvalColumn.retrieval_model.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
+            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            # ColumnFilter(
+            #     AutoEvalColumn.params.name,
+            #     type="slider",
+            #     min=0.01,
+            #     max=150,
+            #     label="Select the number of parameters (B)",
+            # ),
+            # ColumnFilter(
+            #     AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            # ),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -101,92 +101,90 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
+        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+        #
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #
+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+        #
+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(label="Model name")
+        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #             model_type = gr.Dropdown(
+        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                 label="Model type",
+        #                 multiselect=False,
+        #                 value=None,
+        #                 interactive=True,
+        #             )
+        #
+        #         with gr.Column():
+        #             precision = gr.Dropdown(
+        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                 label="Precision",
+        #                 multiselect=False,
+        #                 value="float16",
+        #                 interactive=True,
+        #             )
+        #             weight_type = gr.Dropdown(
+        #                 choices=[i.value.name for i in WeightType],
+        #                 label="Weights type",
+        #                 multiselect=False,
+        #                 value="Original",
+        #                 interactive=True,
+        #             )
+        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+        #             eval_result=gr.Textbox(label="Eval Result")
+        #
+        #     submit_button = gr.Button("Submit Eval")
+        #     submission_result = gr.Markdown()
+        #     submit_button.click(
+        #         add_new_eval,
+        #         [
+        #             model_name_textbox,
+        #             revision_name_textbox,
+        #             model_type,
+        #         ],
+        #         submission_result,
+        #     )
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
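
Note on the sync step this commit disables: before this change the Space pulled the request/result dataset repos on startup. A minimal sketch of that step, assuming the stock leaderboard template's src/envs.py provides QUEUE_REPO, EVAL_REQUESTS_PATH and TOKEN:

from huggingface_hub import snapshot_download
from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH, TOKEN  # assumed template module

# Same call the commit comments out: mirror the queue dataset repo into a
# local folder so the app can read request files from disk.
snapshot_download(
    repo_id=QUEUE_REPO,
    local_dir=EVAL_REQUESTS_PATH,
    repo_type="dataset",
    tqdm_class=None,   # silence per-file progress bars
    etag_timeout=30,
    token=TOKEN,
)
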
eval-results/.gitattributes ADDED
@@ -0,0 +1,55 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
eval-results/demo-leaderboard/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,34 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.0,
+            "map": 0.0
+        },
+        "generation": {
+            "em": 0.002277904328018223,
+            "f1": 0.43525331778147214,
+            "rouge1": 0.3150681120081669,
+            "rouge2": 0.12933954114035873,
+            "rougeL": 0.22495384062408755,
+            "accuracy": 0.33058086560364464,
+            "completeness": 0.5540647198105761,
+            "hallucination": 0.0,
+            "utilization": 0.0,
+            "numerical_accuracy": 0.11534391534391535
+        }
+    },
+    "config": {
+        "eval_name": "CLOSE_deepseek-v2-chat",
+        "generation_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
+        "generation_model_args": {
+            "name": "deepseek-ai/DeepSeek-V2-Chat-0628",
+            "num_params": 80,
+            "open_source": true
+        },
+        "retrieval_model": "CLOSE",
+        "retrieval_model_args": {
+            "num_params": 0.0,
+            "open_source": false
+        }
+    }
+}
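
Each results_*.json added in this commit (the file above and the twelve below) follows the same schema: a "results" block keyed by task group ("retrieval", "generation") and a "config" block describing the retrieval/generation model pair. A minimal reading sketch, not part of the commit; the path is one of the files added here:

import json

path = "eval-results/demo-leaderboard/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json"
with open(path) as f:
    data = json.load(f)

# Flatten the two task groups into "group/metric" keys.
scores = {
    f"{group}/{metric}": value
    for group, metrics in data["results"].items()
    for metric, value in metrics.items()
}

config = data["config"]
print(config["eval_name"])         # "CLOSE_deepseek-v2-chat"
print(config["generation_model"])  # "deepseek-ai/DeepSeek-V2-Chat-0628"
print(scores["generation/f1"])     # 0.43525331778147214
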
eval-results/demo-leaderboard/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,34 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.0,
+            "map": 0.0
+        },
+        "generation": {
+            "em": 0.0005694760820045558,
+            "f1": 0.418257314239393,
+            "rouge1": 0.3061411048446855,
+            "rouge2": 0.12053616693649026,
+            "rougeL": 0.21948810430155005,
+            "accuracy": 0.285876993166287,
+            "completeness": 0.5132605304212169,
+            "hallucination": 0.0,
+            "utilization": 0.0,
+            "numerical_accuracy": 0.06589958158995816
+        }
+    },
+    "config": {
+        "eval_name": "CLOSE_llama3-70b-instruct",
+        "generation_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+        "generation_model_args": {
+            "name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "num_params": 70,
+            "open_source": true
+        },
+        "retrieval_model": "CLOSE",
+        "retrieval_model_args": {
+            "num_params": 0.0,
+            "open_source": false
+        }
+    }
+}
eval-results/demo-leaderboard/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,34 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.0,
+            "map": 0.0
+        },
+        "generation": {
+            "em": 0.0005694760820045558,
+            "f1": 0.4191216882279184,
+            "rouge1": 0.2989940495432677,
+            "rouge2": 0.12047626678426614,
+            "rougeL": 0.2082230205185154,
+            "accuracy": 0.34054669703872437,
+            "completeness": 0.5753690753690753,
+            "hallucination": 0.0,
+            "utilization": 0.0,
+            "numerical_accuracy": 0.12406417112299466
+        }
+    },
+    "config": {
+        "eval_name": "CLOSE_qwen2-72b",
+        "generation_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generation_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72,
+            "open_source": true
+        },
+        "retrieval_model": "CLOSE",
+        "retrieval_model_args": {
+            "num_params": 0.0,
+            "open_source": false
+        }
+    }
+}
eval-results/demo-leaderboard/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,34 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.0,
+            "map": 0.0
+        },
+        "generation": {
+            "em": 0.0,
+            "f1": 0.11613328937628616,
+            "rouge1": 0.15613267640197273,
+            "rouge2": 0.04591153663411247,
+            "rougeL": 0.0496843687172552,
+            "accuracy": 0.14607061503416857,
+            "completeness": 0.4987157534246575,
+            "hallucination": 0.0,
+            "utilization": 0.0,
+            "numerical_accuracy": 0.0748663101604278
+        }
+    },
+    "config": {
+        "eval_name": "CLOSE_yi15-34b",
+        "generation_model": "01ai/Yi-1.5-34B-Chat-16K",
+        "generation_model_args": {
+            "name": "01ai/Yi-1.5-34B-Chat-16K",
+            "num_params": 34,
+            "open_source": true
+        },
+        "retrieval_model": "CLOSE",
+        "retrieval_model_args": {
+            "num_params": 0.0,
+            "open_source": false
+        }
+    }
+}
eval-results/demo-leaderboard/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.36173120728929387,
+            "map": 0.3512338648443432
+        },
+        "generation": {
+            "em": 0.0056947608200455585,
+            "f1": 0.4212862409737785,
+            "rouge1": 0.3707328288930376,
+            "rouge2": 0.21393113234607009,
+            "rougeL": 0.2719847145278759,
+            "accuracy": 0.3886674259681093,
+            "completeness": 0.5858823529411765,
+            "hallucination": 0.07893209518282066,
+            "utilization": 0.48166472642607683,
+            "numerical_accuracy": 0.27365491651205937
+        }
+    },
+    "config": {
+        "eval_name": "gte-qwen2-1.5b_deepseek-v2-chat",
+        "generation_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
+        "generation_model_args": {
+            "name": "deepseek-ai/DeepSeek-V2-Chat-0628",
+            "num_params": 80,
+            "open_source": true
+        },
+        "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+        "retrieval_model_args": {
+            "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+            "num_params": 1.5,
+            "open_source": true
+        }
+    }
+}
eval-results/demo-leaderboard/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.36173120728929387,
+            "map": 0.3512338648443432
+        },
+        "generation": {
+            "em": 0.04555808656036447,
+            "f1": 0.4907954247383474,
+            "rouge1": 0.4080491070348775,
+            "rouge2": 0.23130474174425783,
+            "rougeL": 0.3217574785678875,
+            "accuracy": 0.4216970387243736,
+            "completeness": 0.5688146380270486,
+            "hallucination": 0.11832946635730858,
+            "utilization": 0.4491869918699187,
+            "numerical_accuracy": 0.288981288981289
+        }
+    },
+    "config": {
+        "eval_name": "gte-qwen2-1.5b_llama3-70b-instruct",
+        "generation_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+        "generation_model_args": {
+            "name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "num_params": 70,
+            "open_source": true
+        },
+        "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+        "retrieval_model_args": {
+            "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+            "num_params": 1.5,
+            "open_source": true
+        }
+    }
+}
eval-results/demo-leaderboard/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.36173120728929387,
+            "map": 0.3512338648443432
+        },
+        "generation": {
+            "em": 0.002277904328018223,
+            "f1": 0.3804001391052641,
+            "rouge1": 0.34576336184459094,
+            "rouge2": 0.1928778762677512,
+            "rougeL": 0.2383694455084706,
+            "accuracy": 0.4145785876993166,
+            "completeness": 0.598297213622291,
+            "hallucination": 0.07213496218731821,
+            "utilization": 1.13922942206655,
+            "numerical_accuracy": 0.3218694885361552
+        }
+    },
+    "config": {
+        "eval_name": "gte-qwen2-1.5b_qwen2-72b",
+        "generation_model": "Qwen/Qwen2.5-72B-Instruct",
+        "generation_model_args": {
+            "name": "Qwen/Qwen2.5-72B-Instruct",
+            "num_params": 72,
+            "open_source": true
+        },
+        "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+        "retrieval_model_args": {
+            "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+            "num_params": 1.5,
+            "open_source": true
+        }
+    }
+}
eval-results/demo-leaderboard/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.36173120728929387,
+            "map": 0.3512338648443432
+        },
+        "generation": {
+            "em": 0.0,
+            "f1": 0.16041349053275844,
+            "rouge1": 0.21775697114621573,
+            "rouge2": 0.09738983880706074,
+            "rougeL": 0.08775246194460379,
+            "accuracy": 0.3211845102505695,
+            "completeness": 0.5703789636504254,
+            "hallucination": 0.07665094339622641,
+            "utilization": 0.40828402366863903,
+            "numerical_accuracy": 0.162
+        }
+    },
+    "config": {
+        "eval_name": "gte-qwen2-1.5b_yi15-34b",
+        "generation_model": "01ai/Yi-1.5-34B-Chat-16K",
+        "generation_model_args": {
+            "name": "01ai/Yi-1.5-34B-Chat-16K",
+            "num_params": 34,
+            "open_source": true
+        },
+        "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+        "retrieval_model_args": {
+            "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+            "num_params": 1.5,
+            "open_source": true
+        }
+    }
+}
eval-results/demo-leaderboard/qwen2-72b_bge-large-zh/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.3426063022019742,
+            "map": 0.33500379650721335
+        },
+        "generation": {
+            "em": 0.0017084282460136675,
+            "f1": 0.3797528411547138,
+            "rouge1": 0.3372893350582966,
+            "rouge2": 0.18329984910669803,
+            "rougeL": 0.23230144566069125,
+            "accuracy": 0.40888382687927105,
+            "completeness": 0.6021044427123928,
+            "hallucination": 0.0023391812865497076,
+            "utilization": 0.5014637002341921,
+            "numerical_accuracy": 0.3100358422939068
+        }
+    },
+    "config": {
+        "eval_name": "bge-large-zh_bge-large-zh",
+        "generation_model": "BAAI/bge-large-zh",
+        "generation_model_args": {
+            "name": "BAAI/bge-large-zh",
+            "num_params": 0.2,
+            "open_source": true
+        },
+        "retrieval_model": "BAAI/bge-large-zh",
+        "retrieval_model_args": {
+            "name": "BAAI/bge-large-zh",
+            "num_params": 0.2,
+            "open_source": true
+        }
+    }
+}
eval-results/demo-leaderboard/qwen2-72b_bge-m3/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.3527809415337889,
+            "map": 0.3458855353075171
+        },
+        "generation": {
+            "em": 0.0017084282460136675,
+            "f1": 0.38645032979631466,
+            "rouge1": 0.3467267951634575,
+            "rouge2": 0.1930581604826183,
+            "rougeL": 0.24141093461883717,
+            "accuracy": 0.4271070615034169,
+            "completeness": 0.6119287374128582,
+            "hallucination": 0.0005847953216374269,
+            "utilization": 0.5400116822429907,
+            "numerical_accuracy": 0.3372093023255814
+        }
+    },
+    "config": {
+        "eval_name": "bge-m3_bge-m3",
+        "generation_model": "BAAI/bge-m3",
+        "generation_model_args": {
+            "name": "BAAI/bge-m3",
+            "num_params": 0.2,
+            "open_source": true
+        },
+        "retrieval_model": "BAAI/bge-m3",
+        "retrieval_model_args": {
+            "name": "BAAI/bge-m3",
+            "num_params": 0.2,
+            "open_source": true
+        }
+    }
+}
eval-results/demo-leaderboard/qwen2-72b_e5-mistral-7b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.303246013667426,
+            "map": 0.2960516324981017
+        },
+        "generation": {
+            "em": 0.002277904328018223,
+            "f1": 0.3705164550873997,
+            "rouge1": 0.3270311806826159,
+            "rouge2": 0.17476659877087528,
+            "rougeL": 0.22225645997479143,
+            "accuracy": 0.385250569476082,
+            "completeness": 0.5877535101404057,
+            "hallucination": 1.2922719349215572,
+            "utilization": 0.4793244030285381,
+            "numerical_accuracy": 0.28622540250447226
+        }
+    },
+    "config": {
+        "eval_name": "e5-mistral-7b_e5-mistral-7b",
+        "generation_model": "intfloat/e5-mistral-7b-instruct",
+        "generation_model_args": {
+            "name": "intfloat/e5-mistral-7b-instruct",
+            "num_params": 7,
+            "open_source": true
+        },
+        "retrieval_model": "intfloat/e5-mistral-7b-instruct",
+        "retrieval_model_args": {
+            "name": "intfloat/e5-mistral-7b-instruct",
+            "num_params": 7,
+            "open_source": true
+        }
+    }
+}
eval-results/demo-leaderboard/qwen2-72b_gte-qwen2-1.5b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.36173120728929387,
+            "map": 0.3512338648443432
+        },
+        "generation": {
+            "em": 0.002277904328018223,
+            "f1": 0.3804001391052641,
+            "rouge1": 0.34576336184459094,
+            "rouge2": 0.1928778762677512,
+            "rougeL": 0.2383694455084706,
+            "accuracy": 0.4145785876993166,
+            "completeness": 0.598297213622291,
+            "hallucination": 0.0011627906976744186,
+            "utilization": 1.13922942206655,
+            "numerical_accuracy": 0.3218694885361552
+        }
+    },
+    "config": {
+        "eval_name": "gte-qwen2-1.5b_gte-qwen2-1.5b",
+        "generation_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+        "generation_model_args": {
+            "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+            "num_params": 1.5,
+            "open_source": true
+        },
+        "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+        "retrieval_model_args": {
+            "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+            "num_params": 1.5,
+            "open_source": true
+        }
+    }
+}
eval-results/demo-leaderboard/qwen2-72b_jina-zh/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
+{
+    "results": {
+        "retrieval": {
+            "mrr": 0.27484813971146543,
+            "map": 0.26924354593773725
+        },
+        "generation": {
+            "em": 0.003416856492027335,
+            "f1": 0.37960439080933656,
+            "rouge1": 0.3255380867320351,
+            "rouge2": 0.1732248556904568,
+            "rougeL": 0.22591939162851002,
+            "accuracy": 0.3826879271070615,
+            "completeness": 0.5793588741204065,
+            "hallucination": 0.0017381228273464658,
+            "utilization": 0.4855072463768116,
+            "numerical_accuracy": 0.2663594470046083
+        }
+    },
+    "config": {
+        "eval_name": "jina-zh_jina-zh",
+        "generation_model": "jinaai/reader-lm-0.5b",
+        "generation_model_args": {
+            "name": "jinaai/reader-lm-0.5b",
+            "num_params": 0.2,
+            "open_source": true
+        },
+        "retrieval_model": "jinaai/reader-lm-0.5b",
+        "retrieval_model_args": {
+            "name": "jinaai/reader-lm-0.5b",
+            "num_params": 0.2,
+            "open_source": true
+        }
+    }
+}
src/about.py CHANGED
@@ -12,8 +12,26 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # task0 = Task("anli_r1", "acc", "ANLI")
+    # task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+    # retrieval tasks
+    mrr = Task("retrieval", "mrr", "MRR")
+    map = Task("retrieval", "map", "MAP")
+
+    # generation tasks
+    em = Task("generation", "em", "EM")
+    f1 = Task("generation", "f1", "F1")
+    rouge1 = Task("generation", "rouge1", "Rouge-1")
+    rouge2 = Task("generation", "rouge2", "Rouge-2")
+    rougeL = Task("generation", "rougeL", "Rouge-L")
+
+    accuracy = Task("generation", "accuracy", "ACC")
+    completeness = Task("generation", "completeness", "COMP")
+    hallucination = Task("generation", "hallucination", "HAL")
+    utilization = Task("generation", "utilization", "UTIL")
+    numerical_accuracy = Task("generation", "numerical_accuracy", "MACC")
+
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,7 +39,7 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">Fin Benchmark leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
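
For context, the enum above relies on the template's small Task record; a self-contained sketch, with field names inferred from usage elsewhere in this commit (read_evals.py indexes task.benchmark, display code reads task.value.col_name):

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # task_key in the results json: "retrieval" or "generation"
    metric: str      # metric_key inside that block, e.g. "mrr" or "f1"
    col_name: str    # header displayed on the leaderboard

class Tasks(Enum):
    mrr = Task("retrieval", "mrr", "MRR")
    f1 = Task("generation", "f1", "F1")

# Each member becomes one score column:
for task in Tasks:
    print(task.name, task.value.benchmark, task.value.col_name)
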
src/display/formatting.py CHANGED
@@ -2,7 +2,9 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
-def make_clickable_model(model_name):
+def make_clickable_model(model_name, model_link=None):
+    if model_link:
+        return model_hyperlink(model_link, model_name)
     link = f"https://huggingface.co/{model_name}"
     return model_hyperlink(link, model_name)
 
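
A short usage sketch for the updated helper (the second call's URL is purely illustrative): with no model_link the old behaviour is kept, a hub URL derived from the name; an explicit model_link wins, which lets entries such as closed models point elsewhere.

# Old behaviour: link derived from the model name.
make_clickable_model("Qwen/Qwen2.5-72B-Instruct")
# -> <a ... href="https://huggingface.co/Qwen/Qwen2.5-72B-Instruct" ...>

# New: an explicit link overrides the derived hub URL.
make_clickable_model("CLOSE", model_link="https://example.com/closed-model")
# -> <a ... href="https://example.com/closed-model" ...>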
 
src/display/utils.py CHANGED
@@ -1,8 +1,6 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
-import pandas as pd
-
 from src.about import Tasks
 
 def fields(raw_class):
@@ -23,35 +21,42 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("Type Symbol", "str", True, never_hidden=True)])
+# auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["generation_model", ColumnContent, ColumnContent("Generation Model", "markdown", True, never_hidden=True)])
+
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["gen_num_params", ColumnContent, ColumnContent("Gen#Params (B)", "number", False)])
+auto_eval_column_dict.append(["ret_num_params", ColumnContent, ColumnContent("Ret#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+EvalQueueColumn = make_dataclass("EvalQueueColumn", auto_eval_column_dict, frozen=True)
 
 ## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn: # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
+# @dataclass(frozen=True)
+# class EvalQueueColumn: # Queue column
+#     model = ColumnContent("model", "markdown", True)
+#     revision = ColumnContent("revision", "str", True)
+#     private = ColumnContent("private", "bool", True)
+#     precision = ColumnContent("precision", "str", True)
+#     weight_type = ColumnContent("weight_type", "str", "Original")
+#     status = ColumnContent("status", "str", True)
+
 
 ## All the model information that we might need
 @dataclass
@@ -62,10 +67,10 @@ class ModelDetails:
 
 
 class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    OpenSource = ModelDetails(name="open-source", symbol="🟢")
+    # FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    ClosedSource = ModelDetails(name="closed-source", symbol="⭕")
+    # RL = ModelDetails(name="RL-tuned", symbol="🟦")
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
@@ -73,14 +78,10 @@ class ModelType(Enum):
 
     @staticmethod
    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
+        if "open-source" in type or "🟢" in type:
+            return ModelType.OpenSource
+        if "closed-source" in type or "" in type:
+            return ModelType.ClosedSource
         return ModelType.Unknown
 
 class WeightType(Enum):
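
The new EvalQueueColumn line reuses the same make_dataclass pattern as AutoEvalColumn: every [attribute_name, annotation, default] triple in auto_eval_column_dict becomes one attribute of a frozen class, so adding a column is one list entry. A self-contained sketch; the ColumnContent fields are an assumption inferred from the template and the c.name / c.never_hidden accesses in app.py:

from dataclasses import dataclass, make_dataclass, fields

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

cols = [
    ["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)],
    ["generation_model", ColumnContent, ColumnContent("Generation Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", cols, frozen=True)

print(AutoEvalColumn.average.name)           # "Average ⬆️" (defaults live on the class)
print([f.name for f in fields(AutoEvalColumn)])
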
src/leaderboard/read_evals.py CHANGED
@@ -17,18 +17,21 @@ class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
     """
     eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
     org: str
-    model: str
-    revision: str # commit hash, "" if main
+    generation_model: str
+    retrieval_model: str
+    # revision: str # commit hash, "" if main
     results: dict
+    generation_model_link: str = "" # link to the model on the hub
+    generation_model_args: dict = None
+    retrieval_model_link: str = "" # link to the model on the hub
+    retrieval_model_args: dict = None
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
    weight_type: WeightType = WeightType.Original # Original or Adapter
     architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
-    num_params: int = 0
     date: str = "" # submission date of request file
     still_on_hub: bool = False
 
@@ -41,30 +44,32 @@ class EvalResult:
         config = data.get("config")
 
         # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
+        # precision = Precision.from_str(config.get("model_dtype"))
 
         # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
+        eval_name= config.get("eval_name", "")
+        generation_model = config.get("generation_model", "")
+        retrieval_model = config.get("retrieval_model", "")
+        org= config.get("org", "")
+        # org_and_model = org_and_model.split("/", 1)
+        #
+        # if len(org_and_model) == 1:
+        #     org = None
+        #     model = org_and_model[0]
+        #     result_key = f"{model}_{precision.value.name}"
+        # else:
+        #     org = org_and_model[0]
+        #     model = org_and_model[1]
+        #     result_key = f"{org}_{model}_{precision.value.name}"
+        # full_model = "/".join(org_and_model)
+
+        # still_on_hub, _, model_config = is_model_on_hub(
+        #     full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+        # )
+        # if model_config is not None:
+        #     architectures = getattr(model_config, "architectures", None)
+        #     if architectures:
+        #         architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -79,16 +84,28 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
+        generation_model_args = config.get("generation_model_args", None)
+        retrieval_model_args = config.get("retrieval_model_args", None)
+        open_source= True
+        if not generation_model_args or not generation_model_args.get("open_source", False):
+            open_source = False
+        if not retrieval_model_args or not retrieval_model_args.get("open_source", False):
+            open_source = False
+
         return self(
-            eval_name=result_key,
-            full_model=full_model,
+            eval_name=eval_name,
+            # full_model=full_model,
             org=org,
-            model=model,
+            generation_model=generation_model,
+            retrieval_model=retrieval_model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
+            generation_model_args=generation_model_args,
+            retrieval_model_args=retrieval_model_args,
+            model_type=ModelType.OpenSource if open_source else ModelType.ClosedSource,
+            # precision=precision,
+            # revision= config.get("model_sha", ""),
+            # still_on_hub=still_on_hub,
+            # architecture=architecture
         )
 
     def update_with_request_file(self, requests_path):
@@ -112,18 +129,21 @@ class EvalResult:
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
+            # AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
+            # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            # AutoEvalColumn.architecture.name: self.architecture,
+            AutoEvalColumn.generation_model.name: make_clickable_model(self.generation_model, self.generation_model_link),
+            AutoEvalColumn.retrieval_model.name: make_clickable_model(self.retrieval_model, self.retrieval_model_link),
+            # AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            # AutoEvalColumn.license.name: self.license,
+            # AutoEvalColumn.likes.name: self.likes,
+            # AutoEvalColumn.generation_model_params.name: self.num_params,
+            # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            "Gen#Params (B)": self.generation_model_args.get("num_params", "Unknown"),
+            "Ret#Params (B)": self.retrieval_model_args.get("num_params", "Unknown"),
         }
 
         for task in Tasks:
@@ -171,12 +191,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
+            print(f"Adding {file}")
 
    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        # Creation of result
        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+        # eval_result.update_with_request_file(requests_path)
 
        # Store results of same eval together
        eval_name = eval_result.eval_name
@@ -191,6 +212,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
            v.to_dict() # we test if the dict version is complete
            results.append(v)
        except KeyError: # not all eval values present
+            import traceback
+            traceback.print_exc()
            continue
 
    return results
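
The commit's titular change is the open_source handling in init_from_json_file above: a run is typed open-source only when both members of the retrieval/generation pair declare open_source: true. A standalone sketch of that rule (the function name and re-implementation are illustrative, not part of the commit):

import json

def pair_is_open_source(result_file: str) -> bool:
    config = json.load(open(result_file))["config"]
    gen_args = config.get("generation_model_args") or {}
    ret_args = config.get("retrieval_model_args") or {}
    # Mirrors the logic above: a missing args dict or a false flag on
    # either side makes the whole pair closed-source.
    return bool(gen_args.get("open_source")) and bool(ret_args.get("open_source"))

# The CLOSE_* results above set retrieval "open_source": false,
# so those rows render with the closed-source symbol (⭕).
pair_is_open_source(
    "eval-results/demo-leaderboard/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json"
)  # -> False
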
src/populate.py CHANGED
@@ -12,6 +12,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
+    print(all_data_json)
 
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)