陈俊杰 committed
Commit 496eb7b
1 Parent(s): a2a84e8

cjj-leaderboard

Files changed (2):
  1. app.py +97 -54
  2. test.py +27 -0
app.py CHANGED
@@ -13,10 +13,9 @@ st.title("NTCIR-18 Automatic Evaluation of LLMs (AEOLLM) Task")
with st.sidebar:
    page = option_menu(
        "Navigation",
-         ["Introduction", "Methodology", "Datasets", "Important Dates",
-         "Evaluation Measures", "Data and File format", "Submit",
-         "LeaderBoard", "Organisers", "References"],
-         icons=['house', 'book', 'database', 'calendar', 'clipboard', 'file', 'upload', 'trophy', 'people', 'book'],
+         ["LeaderBoard", "Introduction", "Methodology", "Datasets", "Important Dates",
+         "Evaluation Measures", "Data and File format", "Submit", "Organisers", "References"],
+         icons=['trophy', 'house', 'book', 'database', 'calendar', 'clipboard', 'file', 'upload', 'people', 'book'],
        menu_icon="cast",
        default_index=0,
        styles={
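This hunk only reorders the sidebar: "LeaderBoard" (with its 'trophy' icon) moves to the front of the options, and since `default_index=0` selects the first entry, the leaderboard becomes the landing page. A minimal standalone sketch of the reordered menu, assuming the usual `from streamlit_option_menu import option_menu` import that this call implies (not shown in the hunk):

import streamlit as st
from streamlit_option_menu import option_menu  # assumed import; app.py's call implies it

with st.sidebar:
    page = option_menu(
        "Navigation",
        ["LeaderBoard", "Introduction", "Methodology", "Datasets", "Important Dates",
         "Evaluation Measures", "Data and File format", "Submit", "Organisers", "References"],
        icons=['trophy', 'house', 'book', 'database', 'calendar',
               'clipboard', 'file', 'upload', 'people', 'book'],
        menu_icon="cast",
        default_index=0,  # first entry ("LeaderBoard") is selected on load
    )
st.write("Selected page:", page)  # illustrative placeholder for the page dispatch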
@@ -199,69 +198,113 @@ This leaderboard is used to show the performance of the <strong>automatic evalua
<p>The Leaderboard will be updated daily around 24:00 Beijing Time.</p>
</p>
""", unsafe_allow_html=True)
- # Create sample data
+ # Two-level column labels: task name on top, metric underneath
+ index = pd.MultiIndex.from_tuples([
+     ('', 'teamId'),
+     ('', 'methods'),
+     ('', 'overall'),
+     ('Dialogue Generation', 'accuracy'),
+     ('Dialogue Generation', "kendall's tau"),
+     ('Dialogue Generation', 'spearman'),
+     ('Text Expansion', "accuracy"),
+     ('Text Expansion', "kendall's tau"),
+     ('Text Expansion', 'spearman'),
+     ('Summary Generation', 'accuracy'),
+     ('Summary Generation', "kendall's tau"),
+     ('Summary Generation', 'spearman'),
+     ('Non-Factoid QA', "accuracy"),
+     ('Non-Factoid QA', "kendall's tau"),
+     ('Non-Factoid QA', 'spearman')
+ ])

- # teamId: unique identifier
- DG = {
-     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
-     "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
-     "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
-     "spearman": [0.3505, 0.1857, 0.3264, 0.4512]
+ data = {
+     ('', 'teamId'): ['baseline', 'baseline', 'baseline', 'baseline'],
+     ('', 'methods'): ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
+     ('', 'overall'): [],  # filled in below
+     ('Dialogue Generation', 'accuracy'): [0.5806, 0.5483, 0.6001, 0.6472],
+     ('Dialogue Generation', "kendall's tau"): [0.3243, 0.1739, 0.3042, 0.4167],
+     ('Dialogue Generation', 'spearman'): [0.3505, 0.1857, 0.3264, 0.4512],
+     ('Text Expansion', "accuracy"): [0.5107, 0.5050, 0.5461, 0.5581],
+     ('Text Expansion', "kendall's tau"): [0.1281, 0.0635, 0.2716, 0.3864],
+     ('Text Expansion', 'spearman'): [0.1352, 0.0667, 0.2867, 0.4157],
+     ('Summary Generation', 'accuracy'): [0.6504, 0.6014, 0.7162, 0.7441],
+     ('Summary Generation', "kendall's tau"): [0.3957, 0.2688, 0.5092, 0.5001],
+     ('Summary Generation', 'spearman'): [0.4188, 0.2817, 0.5403, 0.5405],
+     ('Non-Factoid QA', "accuracy"): [0.5935, 0.5817, 0.7000, 0.7203],
+     ('Non-Factoid QA', "kendall's tau"): [0.2332, 0.2389, 0.4440, 0.4235],
+     ('Non-Factoid QA', 'spearman'): [0.2443, 0.2492, 0.4630, 0.4511]
}
-
- df1 = pd.DataFrame(DG)
-
- TE = {
-     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
-     "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
-     "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
-     "spearman": [0.1352, 0.0667, 0.2867, 0.4157]
- }
- df2 = pd.DataFrame(TE)
-
- SG = {
-     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
-     "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
-     "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
-     "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
- }
- df3 = pd.DataFrame(SG)
-
- NFQA = {
-     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
-     "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
-     "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
-     "spearman": [0.2443, 0.2492, 0.4630, 0.4511]
- }
- df4 = pd.DataFrame(NFQA)
-
- df = [df1, df2, df3, df4]
- for d in df:
-     for col in d.select_dtypes(include=['float64', 'int64']).columns:
-         d[col] = d[col].apply(lambda x: f"{x:.4f}")
+ # Average the 12 task metrics (3 metrics x 4 tasks) per method
+ overall = [0, 0, 0, 0]
+ for d in data:
+     if d != ('', 'teamId') and d != ('', 'methods') and d != ('', 'overall'):
+         for i in range(4):
+             overall[i] += data[d][i]
+ overall = [i / (3*4) for i in overall]
+ data[('', 'overall')] = overall
+ df = pd.DataFrame(data)  # tuple keys become the two-level column header
+ for col in df.select_dtypes(include=['float64', 'int64']).columns:
+     df[col] = df[col].apply(lambda x: f"{x:.4f}")
+ st.dataframe(df, use_container_width=True)
+ # # teamId: unique identifier
+ # DG = {
+ #     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
+ #     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
+ #     "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
+ #     "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
+ #     "spearman": [0.3505, 0.1857, 0.3264, 0.4512]
+ # }

+ # df1 = pd.DataFrame(DG)

+ # TE = {
+ #     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
+ #     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
+ #     "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
+ #     "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
+ #     "spearman": [0.1352, 0.0667, 0.2867, 0.4157]
+ # }
+ # df2 = pd.DataFrame(TE)

+ # SG = {
+ #     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
+ #     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
+ #     "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
+ #     "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
+ #     "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
+ # }
+ # df3 = pd.DataFrame(SG)

+ # NFQA = {
+ #     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
+ #     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
+ #     "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
+ #     "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
+ #     "spearman": [0.2443, 0.2492, 0.4630, 0.4511]
+ # }
+ # df4 = pd.DataFrame(NFQA)

+ # df = [df1, df2, df3, df4]
+ # for d in df:
+ #     for col in d.select_dtypes(include=['float64', 'int64']).columns:
+ #         d[col] = d[col].apply(lambda x: f"{x:.4f}")

# Create tabs
- tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
-
- with tab1:
-     st.markdown("""<p class='main-text'>Task: Dialogue Generation; Dataset: DailyDialog</p>""", unsafe_allow_html=True)
-     st.dataframe(df1, use_container_width=True)
-
- with tab2:
-     st.markdown("""<p class='main-text'>Task: Text Expansion; Dataset: WritingPrompts</p>""", unsafe_allow_html=True)
-     st.dataframe(df2, use_container_width=True)
-
- with tab3:
-     st.markdown("""<p class='main-text'>Task: Summary Generation; Dataset: Xsum</p>""", unsafe_allow_html=True)
-     st.dataframe(df3, use_container_width=True)
-
- with tab4:
-     st.markdown("""<p class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</p>""", unsafe_allow_html=True)
-     st.dataframe(df4, use_container_width=True)
+ # tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])

+ # with tab1:
+ #     st.markdown("""<p class='main-text'>Task: Dialogue Generation; Dataset: DailyDialog</p>""", unsafe_allow_html=True)
+ #     st.dataframe(df1, use_container_width=True)

+ # with tab2:
+ #     st.markdown("""<p class='main-text'>Task: Text Expansion; Dataset: WritingPrompts</p>""", unsafe_allow_html=True)
+ #     st.dataframe(df2, use_container_width=True)

+ # with tab3:
+ #     st.markdown("""<p class='main-text'>Task: Summary Generation; Dataset: Xsum</p>""", unsafe_allow_html=True)
+ #     st.dataframe(df3, use_container_width=True)

+ # with tab4:
+ #     st.markdown("""<p class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</p>""", unsafe_allow_html=True)
+ #     st.dataframe(df4, use_container_width=True)
st.markdown("A baseline example can be found in the [baseline_example](https://huggingface.co/spaces/THUIR/AEOLLM/tree/main/baseline_example) folder.")
# Get Beijing time
time_placeholder = st.empty()
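For reference, a minimal sketch of how the new leaderboard table is assembled outside Streamlit: dict keys that are (group, metric) tuples give pandas a two-level column header, which is what makes the task names render above the metric names. Abridged here to one task for brevity; app.py builds the same structure for all four:

import pandas as pd

data = {
    ('', 'teamId'): ['baseline'] * 4,
    ('', 'methods'): ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    ('Dialogue Generation', 'accuracy'): [0.5806, 0.5483, 0.6001, 0.6472],
    ('Dialogue Generation', "kendall's tau"): [0.3243, 0.1739, 0.3042, 0.4167],
    ('Dialogue Generation', 'spearman'): [0.3505, 0.1857, 0.3264, 0.4512],
}

df = pd.DataFrame(data)  # tuple keys become a two-level column MultiIndex
# Render every numeric cell with four decimal places, as the app does.
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col] = df[col].apply(lambda x: f"{x:.4f}")
print(df)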
test.py ADDED
@@ -0,0 +1,27 @@
+ data = {
+     ('', 'teamId'): ['baseline', 'baseline', 'baseline', 'baseline'],
+     ('', 'methods'): ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
+     ('', 'overall'): [],
+     ('Dialogue Generation', 'accuracy'): [0.5806, 0.5483, 0.6001, 0.6472],
+     ('Dialogue Generation', "kendall's tau"): [0.3243, 0.1739, 0.3042, 0.4167],
+     ('Dialogue Generation', 'spearman'): [0.3505, 0.1857, 0.3264, 0.4512],
+     ('Text Expansion', "accuracy"): [0.5107, 0.5050, 0.5461, 0.5581],
+     ('Text Expansion', "kendall's tau"): [0.1281, 0.0635, 0.2716, 0.3864],
+     ('Text Expansion', 'spearman'): [0.1352, 0.0667, 0.2867, 0.4157],
+     ('Summary Generation', 'accuracy'): [0.6504, 0.6014, 0.7162, 0.7441],
+     ('Summary Generation', "kendall's tau"): [0.3957, 0.2688, 0.5092, 0.5001],
+     ('Summary Generation', 'spearman'): [0.4188, 0.2817, 0.5403, 0.5405],
+     ('Non-Factoid QA', "accuracy"): [0.5935, 0.5817, 0.7000, 0.7203],
+     ('Non-Factoid QA', "kendall's tau"): [0.2332, 0.2389, 0.4440, 0.4235],
+     ('Non-Factoid QA', 'spearman'): [0.2443, 0.2492, 0.4630, 0.4511]
+ }
+
+ overall = [0, 0, 0, 0]  # one accumulator per method
+
+ for d in data:
+     if d != ('', 'teamId') and d != ('', 'methods') and d != ('', 'overall'):
+         for i in range(4):
+             overall[i] += data[d][i]
+
+ overall = [i / (3*4) for i in overall]  # mean over 3 metrics x 4 tasks
+ print(overall)
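test.py is an offline sanity check of the overall computation: each overall value is the mean of that method's 12 task metrics (3 metrics across 4 tasks). Working the sums by hand from the numbers above, the script should print approximately the following (values rounded here):

# Expected output of `python test.py`, roughly:
# [0.380, 0.314, 0.476, 0.521]
# i.e. gpt-4o-mini has the strongest overall score among the four baselines.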