Spaces:
Runtime error
Runtime error
Junetheriver
committed on
Commit
•
22cd459
1
Parent(s):
cb9ec9c
added qa leaderboards
Browse files- app.py +7 -0
- data/bosc_zh_qa.csv +19 -0
- data/owl_en_qa.csv +18 -0
- data/owl_zh_qa.csv +18 -0
- data/rzy_zh_qa.csv +18 -0
- data/zabbix_zh_qa.csv +18 -0
- data/zjyd_zh_qa.csv +18 -11
- leaderboard/wired_network.csv +0 -3
- leaderboard/wired_network_en.csv +0 -28
- leaderboard/wired_network_zh.csv +0 -31
- leaderboards.py +5 -3
- opseval_datasets.py +6 -3
app.py
CHANGED
@@ -58,10 +58,17 @@ def process_mc_df(df, shot=None):
|
|
58 |
df = df.reset_index()
|
59 |
return df
|
60 |
|
|
|
|
|
|
|
|
|
|
|
61 |
def dataframe_to_gradio(df, is_mc=True, shot=None):
|
62 |
|
63 |
if is_mc:
|
64 |
df = process_mc_df(df, shot)
|
|
|
|
|
65 |
headers = df.columns
|
66 |
# types = ["str"] + ["number"] * (len(headers) - 1)
|
67 |
|
|
|
58 |
df = df.reset_index()
|
59 |
return df
|
60 |
|
61 |
+
def process_qa_df(df):
|
62 |
+
# 保留小数点后四位
|
63 |
+
df = df.round(4)
|
64 |
+
return df
|
65 |
+
|
66 |
def dataframe_to_gradio(df, is_mc=True, shot=None):
|
67 |
|
68 |
if is_mc:
|
69 |
df = process_mc_df(df, shot)
|
70 |
+
else:
|
71 |
+
df = process_qa_df(df)
|
72 |
headers = df.columns
|
73 |
# types = ["str"] + ["number"] * (len(headers) - 1)
|
74 |
|
data/bosc_zh_qa.csv
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
|
2 |
+
Yi-6B-Chat,0.7727272727272728,0.39624895698624696,0.7542070590484817,0.8510823274180981
|
3 |
+
Internlm2-Chat-20B,0.4974937343358396,0.23934623278394898,0.6204328631578558,0.8259980417246064
|
4 |
+
Yi-6B,0.14285714285714285,0.5190011151439459,0.613405924657078,0.7862767598528019
|
5 |
+
Qwen1.5-0.5B-Chat,0.42857142857142855,0.488269773660657,0.5753016779392393,0.8382536957664072
|
6 |
+
Baichuan2-13B-Chat,0.2857142857142857,0.7405192357284925,0.5708584068359933,0.8691557081776934
|
7 |
+
Internlm2-Chat-7B,0.5,0.22600361749502174,0.570841424928178,0.835409602287096
|
8 |
+
Gemma-2B,0.011904761904761904,0.2803251363458862,0.557515859867883,0.8041128629884543
|
9 |
+
Qwen1.5-4B-Chat,0.42857142857142855,0.7503365683315548,0.5549539275574193,0.8483871388011062
|
10 |
+
Vicuna-13B-V1.5,0.7223381250590647,0.5398926752022207,0.5186514512565293,0.6634719608098028
|
11 |
+
Yi-9B,0.25,0.31590509156704094,0.4925135259219739,0.7803641425669777
|
12 |
+
Baichuan2-7B-Chat,0.42857142857142855,0.5020120070047182,0.4880968089375992,0.8529672546477044
|
13 |
+
Vicuna-7B-V1.5,0.14285714285714285,0.5019194353900974,0.4722927782884535,0.8320120749350508
|
14 |
+
Gpt-3.5-Turbo,0.6428571428571429,0.7368772770427375,0.46705283670036873,0.8518465563087741
|
15 |
+
Qwen1.5-1.8B-Chat,0.40816326530612246,0.6187149363151768,0.46468776283363905,0.8536228462063511
|
16 |
+
Qwen1.5-14B-Chat,0.5,0.62189263577569,0.44546489401960593,0.8573697801600568
|
17 |
+
Qwen1.5-7B-Chat,0.5714285714285714,0.6304668631060011,0.4380453591507842,0.8549025250385108
|
18 |
+
Gemma-7B,0.25,0.09449475661345018,0.4189708246631338,0.7939624827076257
|
19 |
+
Mistral-7B,0.14285714285714285,0.10603847794715389,0.3972735247665132,0.7962369562089103
|
data/owl_en_qa.csv
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
|
2 |
+
Yi-6B-Chat,0.7030488252628491,0.8539968939451946,0.8345303291521267,0.8525487106316666
|
3 |
+
Qwen1.5-7B-Chat,0.7730288486126379,0.800263654571703,0.7701434694510604,0.8096226126029853
|
4 |
+
Vicuna-13B-V1.5,0.7252900133287701,0.8001237927827157,0.7513908712676198,0.8129508434301099
|
5 |
+
Gpt-3.5-Turbo,0.7601832993890021,0.9508419461436827,0.5945237008802382,0.8766794953769677
|
6 |
+
Qwen1.5-14B-Chat,0.8108575380359613,0.9186608285564491,0.5796170633787928,0.8627921465440817
|
7 |
+
Internlm2-Chat-20B,0.8823450852329868,0.8990122482812408,0.5663255561571012,0.817104818105292
|
8 |
+
Internlm2-Chat-7B,0.8716367920317769,0.9049173556355747,0.5566486218868514,0.8194293421446569
|
9 |
+
Vicuna-7B-V1.5,0.6687478686911705,0.8847336678547908,0.5491987169778965,0.8538950235036584
|
10 |
+
Qwen1.5-4B-Chat,0.7161414565826331,0.916949622281115,0.5415164042157119,0.8588077047327288
|
11 |
+
Qwen1.5-1.8B-Chat,0.7559747023809523,0.9469277644039529,0.5355121893517637,0.8511550798429494
|
12 |
+
Baichuan2-13B-Chat,0.724778459441036,0.9033782254193811,0.5324917996259314,0.8430175816264579
|
13 |
+
Baichuan2-7B-Chat,0.663319530710835,0.8543448236955469,0.5222686618152338,0.8364213008907668
|
14 |
+
Gemma-7B,0.5647578582126265,0.6814204309035338,0.5202336438594105,0.7806024397207423
|
15 |
+
Qwen1.5-0.5B-Chat,0.5679874805086168,0.8611226406276706,0.513748281764636,0.812332476681601
|
16 |
+
Mistral-7B,0.6586367313915859,0.7039079054469578,0.5078017923324171,0.7902698697096028
|
17 |
+
Gemma-2B,0.5049161881111284,0.6528267517862424,0.5059908632023802,0.7736166726699579
|
18 |
+
Yi-6B,0.5063160585604476,0.6749962990823568,0.49929516708962135,0.7789524853407436
|
data/owl_zh_qa.csv
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
|
2 |
+
Yi-6B-Chat,0.7600815667858686,0.7912540642980099,0.7610564816532298,0.6761740477117962
|
3 |
+
Qwen1.5-7B-Chat,0.7231431514350063,0.6963952727655642,0.7350526525675435,0.6957602523500673
|
4 |
+
Internlm2-Chat-7B,0.46693736812083075,0.30772894154056324,0.6038537202079961,0.7424094998006903
|
5 |
+
Gpt-3.5-Turbo,0.42171467364950055,0.8691295573045116,0.6038112679436712,0.870169867075605
|
6 |
+
Vicuna-13B-V1.5,0.613175326243208,0.682444153024839,0.5924435357123905,0.6493806776292096
|
7 |
+
Qwen1.5-14B-Chat,0.4991210277214334,0.7953872813217998,0.5891222801836271,0.8588771162081673
|
8 |
+
Internlm2-Chat-20B,0.7918762142818747,0.6187939679354695,0.5756624792803415,0.8195634463645642
|
9 |
+
Baichuan2-13B-Chat,0.4038672142368241,0.8192887757169468,0.5623602404354114,0.8562630521329339
|
10 |
+
Qwen1.5-4B-Chat,0.38204865489701556,0.8389697689558571,0.5521128189924648,0.8573317706502359
|
11 |
+
Gemma-7B,0.4059392201442353,0.35160449208958283,0.5377158689736348,0.7911868222195938
|
12 |
+
Qwen1.5-1.8B-Chat,0.5491781930806321,0.769059886385716,0.5344330706846868,0.8437705436461957
|
13 |
+
Baichuan2-7B-Chat,0.3683127572016461,0.7749038932071436,0.5101570739448591,0.8504995667294786
|
14 |
+
Yi-6B,0.366171888675488,0.35434640725576727,0.48557644112672105,0.7941020553273606
|
15 |
+
Qwen1.5-0.5B-Chat,0.2617687074829932,0.7710346032394836,0.4777335192926036,0.8346118169208395
|
16 |
+
Gemma-2B,0.45988486660889083,0.33416817486815076,0.4592030663597664,0.744093765199015
|
17 |
+
Mistral-7B,0.6409280685903916,0.4271051397084787,0.453115837113527,0.6903975251587856
|
18 |
+
Vicuna-7B-V1.5,0.39214979579762405,0.7771452487068145,0.44308513423549384,0.819478102998531
|
data/rzy_zh_qa.csv
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
|
2 |
+
Qwen1.5-7B-Chat,0.736588808239606,0.7333071013844119,0.7653953121660069,0.7581706400802051
|
3 |
+
Yi-6B-Chat,0.8623512542497523,0.8477785816251643,0.7511472076832977,0.6674107034023887
|
4 |
+
Internlm2-Chat-20B,0.7579980374579645,0.5592058560996795,0.6230274906657811,0.8809475791391357
|
5 |
+
Qwen1.5-1.8B-Chat,0.6993731684981686,0.6912058643011517,0.5921887447381617,0.895549081535767
|
6 |
+
Gemma-2B,0.4793590757810048,0.269158909387365,0.5715789320903422,0.8084277426065398
|
7 |
+
Yi-6B,0.39791261211082174,0.35010016811319283,0.5554912360774231,0.8151911650862558
|
8 |
+
Internlm2-Chat-7B,0.6057084170408346,0.28950531392496315,0.5513185635050407,0.8527933874140172
|
9 |
+
Vicuna-13B-V1.5,0.6149588477366256,0.7175132054894446,0.5484350782035007,0.8846316742953054
|
10 |
+
Gpt-3.5-Turbo,0.6702526487367563,0.8535199907928265,0.5380443637081317,0.9113351056689803
|
11 |
+
Baichuan2-7B-Chat,0.6457107843137256,0.7989283627012825,0.5355149927949222,0.8918899008657395
|
12 |
+
Qwen1.5-14B-Chat,0.7039449112978525,0.7891124698018288,0.5351538957435175,0.9060753469650263
|
13 |
+
Qwen1.5-4B-Chat,0.6079656862745099,0.798414770802262,0.5349164010626877,0.8926774424126845
|
14 |
+
Vicuna-7B-V1.5,0.5618038576473784,0.7385375964159062,0.5346381268062822,0.8785135365491068
|
15 |
+
Qwen1.5-0.5B-Chat,0.5161804573314475,0.7335961705843393,0.5329134165403151,0.878000802003553
|
16 |
+
Baichuan2-13B-Chat,0.6229674796747967,0.8122416536307804,0.5111467259298673,0.8969644779921856
|
17 |
+
Gemma-7B,0.6952392516403653,0.42448628847691194,0.4304401424621823,0.6676771540611001
|
18 |
+
Mistral-7B,0.8060009447278426,0.5415825155389061,0.39330883346357015,0.5908077476385994
|
data/zabbix_zh_qa.csv
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
|
2 |
+
Qwen1.5-7B-Chat,0.7784179666191972,0.6446005816609537,0.754576907135137,0.685739970384688
|
3 |
+
Gemma-2B,0.24509853619131058,0.3864916275690845,0.7478997831333918,0.5331677452195527
|
4 |
+
Qwen1.5-1.8B-Chat,0.6049019607843137,0.548904353980019,0.7198411238809446,0.9040869829649206
|
5 |
+
Yi-6B-Chat,0.7810660719751629,0.7167976589186875,0.6766795202404136,0.9015276627615246
|
6 |
+
Gpt-3.5-Turbo,0.7246376811594203,0.8043651144702015,0.6509014034090643,0.9218230132938211
|
7 |
+
Vicuna-7B-V1.5,0.4022032693674485,0.6008780718971581,0.6017002573832742,0.8863580258321797
|
8 |
+
Qwen1.5-14B-Chat,0.6328502415458936,0.7221033096305126,0.5966395914029399,0.9145067669966427
|
9 |
+
Qwen1.5-4B-Chat,0.5223151244890376,0.7184562339362471,0.5920241633149731,0.9031485610681586
|
10 |
+
Qwen1.5-0.5B-Chat,0.4166666666666667,0.6888304890847555,0.5800664845337345,0.8918967698708089
|
11 |
+
Internlm2-Chat-20B,0.7642667437926058,0.4826757943830156,0.5799585238847701,0.8876497593181047
|
12 |
+
Baichuan2-7B-Chat,0.48357487922705317,0.6222193378376162,0.578165574028535,0.9011250283968237
|
13 |
+
Vicuna-13B-V1.5,0.5991387785360396,0.6932977841508635,0.5747798091575604,0.8409686000423541
|
14 |
+
Baichuan2-13B-Chat,0.4896135265700483,0.7843769138572264,0.5653592173980313,0.9083387421281699
|
15 |
+
Internlm2-Chat-7B,0.558064058956916,0.10309630273051296,0.5526199329356135,0.8526944078477274
|
16 |
+
Yi-6B,0.3099052131839017,0.3558360880812697,0.5458649977309493,0.8362572777967558
|
17 |
+
Mistral-7B,0.6256150793650794,0.29009866821782115,0.5221570098966367,0.8399808519337731
|
18 |
+
Gemma-7B,0.4451515151515152,0.3045735267275342,0.5168775971677172,0.8368893516841295
|
data/zjyd_zh_qa.csv
CHANGED
@@ -1,11 +1,18 @@
|
|
1 |
-
name,
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
Qwen1.5-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
Qwen1.5-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
|
2 |
+
Qwen1.5-7B-Chat,0.9001343784903166,0.8310295209988445,0.9078620109210616,0.8729269504884732
|
3 |
+
Qwen1.5-4B-Chat,0.8871057360749349,0.8993513260808733,0.8307124664634027,0.820558540099676
|
4 |
+
Gpt-3.5-Turbo,0.844,0.9381622494626971,0.7014364361923093,0.9646698567096922
|
5 |
+
Qwen1.5-14B-Chat,0.9143835616438356,0.9096799403053909,0.6924722613000067,0.9576526234420509
|
6 |
+
Vicuna-13B-V1.5,0.8201754385964912,0.8811313951272425,0.68730238555884,0.9481117257306625
|
7 |
+
Baichuan2-13B-Chat,0.902,0.9045473946630944,0.6857149093288882,0.9578530098977669
|
8 |
+
Baichuan2-7B-Chat,0.8784722222222222,0.896849978755001,0.6751955501292016,0.9500810985536641
|
9 |
+
Qwen1.5-1.8B-Chat,0.9148888888888888,0.8586071776868396,0.6748854449858851,0.947046701753897
|
10 |
+
Yi-6B-Chat,0.9511929511929511,0.7986143744572479,0.6694793902546,0.9285801614165997
|
11 |
+
Qwen1.5-0.5B-Chat,0.8277777777777777,0.8901546106376419,0.6588250813657541,0.9469939028778743
|
12 |
+
Vicuna-7B-V1.5,0.7171052631578947,0.8301247992194959,0.6521358982668551,0.9382112592746454
|
13 |
+
Internlm2-Chat-20B,0.8146430093452255,0.6294665932615476,0.5592223065723815,0.9031372380769384
|
14 |
+
Internlm2-Chat-7B,0.7936354405828091,0.6059388264548148,0.5497547973508542,0.9071347182079667
|
15 |
+
Gemma-7B,0.5690370087428911,0.294443307376398,0.5182431858082619,0.8437500469275063
|
16 |
+
Yi-6B,0.4679211960033877,0.29049526322106994,0.4910372529026469,0.8409424204038982
|
17 |
+
Mistral-7B,0.7985507246376812,0.40909012863946165,0.4698180894318443,0.85180274226269
|
18 |
+
Gemma-2B,0.5461295296041059,0.32955654240675497,0.4138425436194475,0.8085919354670669
|
leaderboard/wired_network.csv
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
Model, Zeroshot, Fewshot(3-shot), Best Score
|
2 |
-
ChatGPT, 80, 80, 80
|
3 |
-
GPT-3, 23, 23, 23
|
|
|
|
|
|
|
|
leaderboard/wired_network_en.csv
DELETED
@@ -1,28 +0,0 @@
|
|
1 |
-
-,-,Zero-shot,-,v,-,3-Shot,-,-,-,-
|
2 |
-
-,-,Naïve,SC,CoT,CoT+SC,Naïve,SC,CoT,CoT+SC,Best Score
|
3 |
-
1,✨ GPT-4,/,/,/,/,/,/,88.7,88.7,88.7
|
4 |
-
2,✨ Yi-34B-Chat,57.75,59.14,65.11,68.79,68.16,68.37,78.09,80.06,80.06
|
5 |
-
3,✨ Qwen-72B-Chat,70.41,70.5,72.38,72.56,70.32,70.32,70.13,70.22,72.56
|
6 |
-
4,✨ GPT-3.5-turbo,66.6,66.8,69.6,72,68.3,68.3,70.9,72.5,72.5
|
7 |
-
5,✨ ERNIE-Bot-4.0,61.15,61.15,70,70,60,60,70,70,70
|
8 |
-
6,✨ qwen1.5-14b-chat,54.9,56.44,64.09,67.1,52.23,53.52,59.54,64.18,67.1
|
9 |
-
7,✨ qwen1.5-14b-base,34.88,34.88,60.82,60.82,65.55,65.55,47.08,47.08,65.55
|
10 |
-
8,✨ DevOps-Model-14B-Chat,30.69,30.59,55.77,63.63,63.85,61.96,41.15,44.01,63.85
|
11 |
-
9,✨ Qwen-14B-Chat,43.78,47.81,56.58,59.4,62.09,59.7,49.06,55.88,62.09
|
12 |
-
10,✨ LLaMA-2-13B,41.8,46.5,53.1,58.7,53.3,53,56.8,61,61
|
13 |
-
11,✨ InternLM2-Chat-20B,56.36,56.36,26.18,26.18,60.48,60.48,45.1,45.1,60.48
|
14 |
-
12,✨ LLaMA-2-70B-Chat,25.29,25.29,57.97,58.06,52.97,52.97,58.55,58.55,58.55
|
15 |
-
13,✨ InternLM2-Chat-7B,49.74,49.74,56.19,56.19,48.2,48.2,49.74,49.74,56.19
|
16 |
-
14,✨ LLaMA-2-7B,39.5,40,45.4,49.5,48.2,46.8,52,55.2,55.2
|
17 |
-
15,✨ Qwen-7B-Chat,45.9,46,47.3,50.1,52.1,51,48.3,49.8,52.1
|
18 |
-
16,✨ gemma_7b,25.09,25.09,50.86,50.86,30.24,30.24,51.56,51.56,51.56
|
19 |
-
17,✨ InternLM-7B,38.7,38.7,43.9,43.9,45.2,45.2,51.4,51.4,51.4
|
20 |
-
18,✨ Chinese-Alpaca-2-13B,37.7,37.7,49.7,49.7,48.6,48.6,50.5,50.5,50.5
|
21 |
-
19,✨ Mistral-7B,29.27,29.27,46.3,46.3,47.22,47.22,45.58,45.58,47.22
|
22 |
-
20,✨ AquilaChat2-34B,36.63,36.63,44.83,44.83,46.65,46.65,NULL,NULL,46.65
|
23 |
-
21,✨ ChatGLM3-6B,43.38,43.38,44.59,44.59,42.1,42.1,43.47,43.47,44.59
|
24 |
-
22,✨ ChatGLM2-6B,24.8,24.7,36.6,36.5,37.6,37.6,40.5,40.5,40.5
|
25 |
-
23,✨ Chinese-LLaMA-2-13B,29.4,29.4,37.8,37.8,40.4,40.4,28.8,28.8,40.4
|
26 |
-
24,✨ gemma_2b,26.46,26.46,33.42,33.42,26.63,26.63,37.54,37.54,37.54
|
27 |
-
25,✨ Baichuan-13B-Chat,18.3,20.4,28.6,37,24.1,26.7,18.2,17.8,37
|
28 |
-
26,✨ Baichuan2-13B-Chat,14.1,15.3,24.1,25.8,32.3,33.1,25.6,27.7,33.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
leaderboard/wired_network_zh.csv
DELETED
@@ -1,31 +0,0 @@
|
|
1 |
-
-,-,Zero-shot,-,v,-,3-Shot,-,-,-,-
|
2 |
-
-,-,Naïve,SC,CoT,CoT+SC,Naïve,SC,CoT,CoT+SC,Best Score
|
3 |
-
1,✨ GPT-4,/,/,/,/,/,/,86,86,86
|
4 |
-
2,✨ ERNIE-Bot-4.0,67.54,67.54,71.96,71.96,72,72,78,78,78
|
5 |
-
3,✨ Yi-34B-Chat,61.61,62.56,68.11,69.75,65.73,65.37,69.88,71.21,71.21
|
6 |
-
4,✨ Qwen-72B-Chat,65.77,65.86,68.13,68.3,69.4,69.4,69.99,70.08,70.08
|
7 |
-
5,✨ Hunyuan-13B,60,60,70,70,,,,,70
|
8 |
-
6,✨ GPT-3.5-turbo,58.4,58.6,64.8,67.6,59.2,59.7,65.2,67.4,67.6
|
9 |
-
7,✨ GLM4,67.38,67.38,,,,,,,67.38
|
10 |
-
8,✨ qwen1.5-14b-chat,54.04,53.87,62.56,63.86,58.78,58.09,63.43,65.58,65.58
|
11 |
-
9,✨ DevOps-Model-14B-Chat,47.59,46.57,52.52,56.01,62.07,60.08,50.59,55.79,62.07
|
12 |
-
10,✨ qwen1.5-14b-base,45.18,45.18,59.12,59.12,61.1,61.1,52.5,52.5,61.1
|
13 |
-
11,✨ InternLM2-Chat-7B,54.3,54.3,59.81,59.81,58.52,58.52,51.64,51.64,59.81
|
14 |
-
12,✨ GLM3-turbo,59.64,59.64,,,,,,,59.64
|
15 |
-
13,✨ InternLM2-Chat-20B,57.49,57.49,57.14,57.14,59.12,59.12,50.77,50.77,59.12
|
16 |
-
14,✨ Qwen-14B-Chat,48.35,48.81,55.35,57.4,58.53,56.12,52.12,54.99,58.53
|
17 |
-
15,✨ LLaMA-2-70B-Chat,38.55,38.55,57.49,57.49,49.09,49.09,48.57,48.57,57.49
|
18 |
-
16,✨ LLaMA-2-13B,29.7,31.6,51.6,57,39.6,38.9,48,50.6,57
|
19 |
-
17,✨ Baichuan-13B-Chat,15.2,16,43.9,49.7,34.3,36.1,51.3,55.6,55.6
|
20 |
-
18,✨ LLaMA-2-7B,29.8,30.2,50.1,55.6,38.6,40.8,45.6,50.4,55.6
|
21 |
-
19,✨ Qwen-7B-Chat,29.6,29.9,50.6,53.5,50.4,46.9,46.9,47.7,53.5
|
22 |
-
20,✨ ChatGLM3-6B,41.39,41.39,49.23,49.23,38.81,38.81,42.86,42.86,49.23
|
23 |
-
21,✨ gemma_7b,31.58,31.58,47.59,47.59,34.68,34.68,48.88,48.88,48.88
|
24 |
-
22,✨ AquilaChat2-34B,34.66,34.66,47.74,47.74,44.48,44.48,NULL,NULL,47.74
|
25 |
-
23,✨ Mistral-7B,1.9,1.9,45.61,45.61,15,15,35.97,35.97,45.61
|
26 |
-
24,✨ Chinese-Alpaca-2-13B,33.1,33.1,44.2,44.2,44,44,42.7,42.7,44.2
|
27 |
-
25,✨ InternLM-7B,41.7,41.7,38.4,38.4,42.6,42.6,41.3,41.3,42.6
|
28 |
-
26,✨ ChatGLM2-6B,33.8,33.7,42.1,42.2,36,36,39.5,39.5,42.2
|
29 |
-
27,✨ Chinese-LLaMA-2-13B,22.5,22.5,38.8,38.8,41.8,41.8,32.2,32.2,41.8
|
30 |
-
28,✨ gemma_2b,29.69,29.69,39.16,39.16,29.78,29.78,38.64,38.64,39.16
|
31 |
-
29,✨ Baichuan2-13B-Chat,35.6,35.9,30.5,30.5,34.6,35.6,30.2,32,35.9
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
leaderboards.py
CHANGED
@@ -3,6 +3,7 @@ eng_leaderboards = [
|
|
3 |
('lenovo', ['mc']),
|
4 |
('oracle', ['mc']),
|
5 |
('network', ['mc', 'qa']),
|
|
|
6 |
]
|
7 |
|
8 |
chi_leaderboards = [
|
@@ -11,12 +12,13 @@ chi_leaderboards = [
|
|
11 |
('zjyd', ['mc', 'qa']),
|
12 |
('network', ['mc', 'qa']),
|
13 |
('pufa', ['mc']),
|
14 |
-
('zabbix', ['mc']),
|
15 |
('dfcdata', ['mc']),
|
16 |
('zte', ['mc']),
|
17 |
('oracle', ['mc']),
|
18 |
('tencent', ['qa']),
|
19 |
-
('bosc', ['mc']),
|
20 |
-
('rzy', ['mc']),
|
21 |
('lenovo', ['mc']),
|
|
|
22 |
]
|
|
|
3 |
('lenovo', ['mc']),
|
4 |
('oracle', ['mc']),
|
5 |
('network', ['mc', 'qa']),
|
6 |
+
('owl', ['qa']),
|
7 |
]
|
8 |
|
9 |
chi_leaderboards = [
|
|
|
12 |
('zjyd', ['mc', 'qa']),
|
13 |
('network', ['mc', 'qa']),
|
14 |
('pufa', ['mc']),
|
15 |
+
('zabbix', ['mc', 'qa']),
|
16 |
('dfcdata', ['mc']),
|
17 |
('zte', ['mc']),
|
18 |
('oracle', ['mc']),
|
19 |
('tencent', ['qa']),
|
20 |
+
('bosc', ['mc', 'qa']),
|
21 |
+
('rzy', ['mc', 'qa']),
|
22 |
('lenovo', ['mc']),
|
23 |
+
('owl', ['qa']),
|
24 |
]
|
opseval_datasets.py
CHANGED
@@ -11,7 +11,8 @@ datasets_abbr = [
|
|
11 |
'tencent',
|
12 |
'bosc',
|
13 |
'rzy',
|
14 |
-
'lenovo'
|
|
|
15 |
]
|
16 |
|
17 |
datasets_zh = [
|
@@ -27,7 +28,8 @@ datasets_zh = [
|
|
27 |
'DevOps能力评测(腾讯)',
|
28 |
'金融信创系统运维(上海银行)',
|
29 |
'日志分析能力评测(日志易)',
|
30 |
-
'混合云建设与运维(联想集团)'
|
|
|
31 |
]
|
32 |
|
33 |
datasets_en = [
|
@@ -43,7 +45,8 @@ datasets_en = [
|
|
43 |
"DevOps Capability",
|
44 |
"Financial New Generation System",
|
45 |
"Log Analysis",
|
46 |
-
"Hybrid Cloud Construction and Operations"
|
|
|
47 |
]
|
48 |
|
49 |
dataset_abbr_zh_dict = {
|
|
|
11 |
'tencent',
|
12 |
'bosc',
|
13 |
'rzy',
|
14 |
+
'lenovo',
|
15 |
+
'owl',
|
16 |
]
|
17 |
|
18 |
datasets_zh = [
|
|
|
28 |
'DevOps能力评测(腾讯)',
|
29 |
'金融信创系统运维(上海银行)',
|
30 |
'日志分析能力评测(日志易)',
|
31 |
+
'混合云建设与运维(联想集团)',
|
32 |
+
'OWL',
|
33 |
]
|
34 |
|
35 |
datasets_en = [
|
|
|
45 |
"DevOps Capability",
|
46 |
"Financial New Generation System",
|
47 |
"Log Analysis",
|
48 |
+
"Hybrid Cloud Construction and Operations",
|
49 |
+
"OWL",
|
50 |
]
|
51 |
|
52 |
dataset_abbr_zh_dict = {
|