陈俊杰
committed on
Commit
•
496eb7b
1
Parent(s):
a2a84e8
cjj-leaderboard
Browse files
app.py
CHANGED
@@ -13,10 +13,9 @@ st.title("NTCIR-18 Automatic Evaluation of LLMs (AEOLLM) Task")
|
|
13 |
with st.sidebar:
|
14 |
page = option_menu(
|
15 |
"Navigation",
|
16 |
-
["Introduction", "Methodology", "Datasets", "Important Dates",
|
17 |
-
"Evaluation Measures", "Data and File format", "Submit",
|
18 |
-
|
19 |
-
icons=['house', 'book', 'database', 'calendar', 'clipboard', 'file', 'upload', 'trophy', 'people', 'book'],
|
20 |
menu_icon="cast",
|
21 |
default_index=0,
|
22 |
styles={
|
@@ -199,69 +198,113 @@ This leaderboard is used to show the performance of the <strong>automatic evalua
|
|
199 |
<p>The Leaderboard will be updated daily around 24:00 Beijing Time.</p>
|
200 |
</p>
|
201 |
""", unsafe_allow_html=True)
|
202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
"kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
|
213 |
-
df1 = pd.DataFrame(DG)
|
214 |
|
215 |
-
TE = {
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
}
|
222 |
-
df2 = pd.DataFrame(TE)
|
223 |
|
224 |
-
SG = {
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
}
|
231 |
-
df3 = pd.DataFrame(SG)
|
232 |
|
233 |
-
NFQA = {
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
}
|
240 |
-
df4 = pd.DataFrame(NFQA)
|
241 |
|
242 |
-
df = [df1, df2, df3, df4]
|
243 |
-
for d in df:
|
244 |
-
|
245 |
-
|
246 |
|
247 |
# 创建标签页
|
248 |
-
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
|
249 |
|
250 |
-
with tab1:
|
251 |
-
|
252 |
-
|
253 |
|
254 |
-
with tab2:
|
255 |
-
|
256 |
-
|
257 |
|
258 |
-
with tab3:
|
259 |
-
|
260 |
-
|
261 |
|
262 |
-
with tab4:
|
263 |
-
|
264 |
-
|
265 |
st.markdown("A baseline example can be found in the [baseline_example](https://huggingface.co/spaces/THUIR/AEOLLM/tree/main/baseline_example) folder.")
|
266 |
# 获取北京时间
|
267 |
time_placeholder = st.empty()
|
|
|
13 |
with st.sidebar:
|
14 |
page = option_menu(
|
15 |
"Navigation",
|
16 |
+
["LeaderBoard", "Introduction", "Methodology", "Datasets", "Important Dates",
|
17 |
+
"Evaluation Measures", "Data and File format", "Submit", "Organisers", "References"],
|
18 |
+
icons=['trophy', 'house', 'book', 'database', 'calendar', 'clipboard', 'file', 'upload', 'people', 'book'],
|
|
|
19 |
menu_icon="cast",
|
20 |
default_index=0,
|
21 |
styles={
|
|
|
198 |
<p>The Leaderboard will be updated daily around 24:00 Beijing Time.</p>
|
199 |
</p>
|
200 |
""", unsafe_allow_html=True)
|
201 |
+
# Leaderboard table: two-level (task, metric) column labels.
# NOTE(review): `index` is built but never attached to the DataFrame below —
# pd.DataFrame(data) already derives the same MultiIndex from the tuple keys.
index = pd.MultiIndex.from_tuples([
    ('', 'teamId'),
    ('', 'methods'),
    ('', 'overall'),
    ('Dialogue Generation', 'accuracy'),
    ('Dialogue Generation', "kendall's tau"),
    ('Dialogue Generation', 'spearman'),
    ('Text Expansion', "accuracy"),
    ('Text Expansion', "kendall's tau"),
    ('Text Expansion', 'spearman'),
    ('Summary Generation', 'accuracy'),
    ('Summary Generation', "kendall's tau"),
    ('Summary Generation', 'spearman'),
    ('Non-Factoid QA', "accuracy"),
    ('Non-Factoid QA', "kendall's tau"),
    ('Non-Factoid QA', 'spearman')
])

# Baseline results keyed by (task, metric); the 'overall' column is filled
# in below once all per-task scores are known.
data = {
    ('', 'teamId'): ['baseline', 'baseline', 'baseline', 'baseline'],
    ('', 'methods'): ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    ('', 'overall'): [],
    ('Dialogue Generation', 'accuracy'): [0.5806, 0.5483, 0.6001, 0.6472],
    ('Dialogue Generation', "kendall's tau"): [0.3243, 0.1739, 0.3042, 0.4167],
    ('Dialogue Generation', 'spearman'): [0.3505, 0.1857, 0.3264, 0.4512],
    ('Text Expansion', "accuracy"): [0.5107, 0.5050, 0.5461, 0.5581],
    ('Text Expansion', "kendall's tau"): [0.1281, 0.0635, 0.2716, 0.3864],
    ('Text Expansion', 'spearman'): [0.1352, 0.0667, 0.2867, 0.4157],
    ('Summary Generation', 'accuracy'): [0.6504, 0.6014, 0.7162, 0.7441],
    ('Summary Generation', "kendall's tau"): [0.3957, 0.2688, 0.5092, 0.5001],
    ('Summary Generation', 'spearman'): [0.4188, 0.2817, 0.5403, 0.5405],
    ('Non-Factoid QA', "accuracy"): [0.5935, 0.5817, 0.7000, 0.7203],
    ('Non-Factoid QA', "kendall's tau"): [0.2332, 0.2389, 0.4440, 0.4235],
    ('Non-Factoid QA', 'spearman'): [0.2443, 0.2492, 0.4630, 0.4511]
}

# Overall score per method = mean of its 12 metric scores (4 tasks x 3 metrics).
meta_columns = {('', 'teamId'), ('', 'methods'), ('', 'overall')}
overall = [0, 0, 0, 0]
for column, values in data.items():
    if column not in meta_columns:
        for i in range(4):
            overall[i] += values[i]
data[('', 'overall')] = [total / (3 * 4) for total in overall]

# BUG FIX: the original did `for d in data: d.select_dtypes(...)`, which
# iterates the dict's *tuple keys* and so raises AttributeError at runtime,
# and then displayed the raw dict, never the formatted values. Build the
# DataFrame first (tuple keys become MultiIndex columns), format its numeric
# columns to 4 decimals for display, and render the DataFrame.
df = pd.DataFrame(data)
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col] = df[col].apply(lambda x: f"{x:.4f}")
st.dataframe(df, use_container_width=True)
|
247 |
+
# # teamId 唯一标识码
|
248 |
+
# DG = {
|
249 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
250 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
251 |
+
# "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
|
252 |
+
# "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
|
253 |
+
# "spearman": [0.3505, 0.1857, 0.3264, 0.4512]
|
254 |
+
# }
|
255 |
|
256 |
+
# df1 = pd.DataFrame(DG)
|
257 |
|
258 |
+
# TE = {
|
259 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
260 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
261 |
+
# "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
|
262 |
+
# "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
|
263 |
+
# "spearman": [0.1352, 0.0667, 0.2867, 0.4157]
|
264 |
+
# }
|
265 |
+
# df2 = pd.DataFrame(TE)
|
266 |
|
267 |
+
# SG = {
|
268 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
269 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
270 |
+
# "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
|
271 |
+
# "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
|
272 |
+
# "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
|
273 |
+
# }
|
274 |
+
# df3 = pd.DataFrame(SG)
|
275 |
|
276 |
+
# NFQA = {
|
277 |
+
# "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
|
278 |
+
# "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
|
279 |
+
# "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
|
280 |
+
# "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
|
281 |
+
# "spearman": [0.2443, 0.2492, 0.4630, 0.4511]
|
282 |
+
# }
|
283 |
+
# df4 = pd.DataFrame(NFQA)
|
284 |
|
285 |
+
# df = [df1, df2, df3, df4]
|
286 |
+
# for d in df:
|
287 |
+
# for col in d.select_dtypes(include=['float64', 'int64']).columns:
|
288 |
+
# d[col] = d[col].apply(lambda x: f"{x:.4f}")
|
289 |
|
290 |
# 创建标签页
|
291 |
+
# tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
|
292 |
|
293 |
+
# with tab1:
|
294 |
+
# st.markdown("""<p class='main-text'>Task: Dialogue Generation; Dataset: DialyDialog</p>""", unsafe_allow_html=True)
|
295 |
+
# st.dataframe(df1, use_container_width=True)
|
296 |
|
297 |
+
# with tab2:
|
298 |
+
# st.markdown("""<p class='main-text'>Task: Text Expansion; Dataset: WritingPrompts</p>""", unsafe_allow_html=True)
|
299 |
+
# st.dataframe(df2, use_container_width=True)
|
300 |
|
301 |
+
# with tab3:
|
302 |
+
# st.markdown("""<p class='main-text'>Task: Summary Generation; Dataset: Xsum</p>""", unsafe_allow_html=True)
|
303 |
+
# st.dataframe(df3, use_container_width=True)
|
304 |
|
305 |
+
# with tab4:
|
306 |
+
# st.markdown("""<p class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</p>""", unsafe_allow_html=True)
|
307 |
+
# st.dataframe(df4, use_container_width=True)
|
308 |
st.markdown("A baseline example can be found in the [baseline_example](https://huggingface.co/spaces/THUIR/AEOLLM/tree/main/baseline_example) folder.")
|
309 |
# 获取北京时间
|
310 |
time_placeholder = st.empty()
|
test.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Baseline leaderboard numbers keyed by (task, metric) column tuples; the
# three identifier columns carry no scores ('overall' is computed here).
data = {
    ('', 'teamId'): ['baseline', 'baseline', 'baseline', 'baseline'],
    ('', 'methods'): ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    ('', 'overall'): [],
    ('Dialogue Generation', 'accuracy'): [0.5806, 0.5483, 0.6001, 0.6472],
    ('Dialogue Generation', "kendall's tau"): [0.3243, 0.1739, 0.3042, 0.4167],
    ('Dialogue Generation', 'spearman'): [0.3505, 0.1857, 0.3264, 0.4512],
    ('Text Expansion', "accuracy"): [0.5107, 0.5050, 0.5461, 0.5581],
    ('Text Expansion', "kendall's tau"): [0.1281, 0.0635, 0.2716, 0.3864],
    ('Text Expansion', 'spearman'): [0.1352, 0.0667, 0.2867, 0.4157],
    ('Summary Generation', 'accuracy'): [0.6504, 0.6014, 0.7162, 0.7441],
    ('Summary Generation', "kendall's tau"): [0.3957, 0.2688, 0.5092, 0.5001],
    ('Summary Generation', 'spearman'): [0.4188, 0.2817, 0.5403, 0.5405],
    ('Non-Factoid QA', "accuracy"): [0.5935, 0.5817, 0.7000, 0.7203],
    ('Non-Factoid QA', "kendall's tau"): [0.2332, 0.2389, 0.4440, 0.4235],
    ('Non-Factoid QA', 'spearman'): [0.2443, 0.2492, 0.4630, 0.4511]
}

# Columns that hold identifiers rather than metric scores.
_META = {('', 'teamId'), ('', 'methods'), ('', 'overall')}

# Average each method's 12 metric scores (4 tasks x 3 metrics each).
overall = [0, 0, 0, 0]
for key, scores in data.items():
    if key in _META:
        continue
    for idx, score in enumerate(scores):
        overall[idx] += score
overall = [total / (3 * 4) for total in overall]

print(overall)
|