Junetheriver committed on
Commit
22cd459
1 Parent(s): cb9ec9c

added qa leaderboards

Browse files
app.py CHANGED
@@ -58,10 +58,17 @@ def process_mc_df(df, shot=None):
58
  df = df.reset_index()
59
  return df
60
 
 
 
 
 
 
61
  def dataframe_to_gradio(df, is_mc=True, shot=None):
62
 
63
  if is_mc:
64
  df = process_mc_df(df, shot)
 
 
65
  headers = df.columns
66
  # types = ["str"] + ["number"] * (len(headers) - 1)
67
 
 
58
  df = df.reset_index()
59
  return df
60
 
61
+ def process_qa_df(df):
62
+ # 保留小数点后四位
63
+ df = df.round(4)
64
+ return df
65
+
66
  def dataframe_to_gradio(df, is_mc=True, shot=None):
67
 
68
  if is_mc:
69
  df = process_mc_df(df, shot)
70
+ else:
71
+ df = process_qa_df(df)
72
  headers = df.columns
73
  # types = ["str"] + ["number"] * (len(headers) - 1)
74
 
data/bosc_zh_qa.csv ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
2
+ Yi-6B-Chat,0.7727272727272728,0.39624895698624696,0.7542070590484817,0.8510823274180981
3
+ Internlm2-Chat-20B,0.4974937343358396,0.23934623278394898,0.6204328631578558,0.8259980417246064
4
+ Yi-6B,0.14285714285714285,0.5190011151439459,0.613405924657078,0.7862767598528019
5
+ Qwen1.5-0.5B-Chat,0.42857142857142855,0.488269773660657,0.5753016779392393,0.8382536957664072
6
+ Baichuan2-13B-Chat,0.2857142857142857,0.7405192357284925,0.5708584068359933,0.8691557081776934
7
+ Internlm2-Chat-7B,0.5,0.22600361749502174,0.570841424928178,0.835409602287096
8
+ Gemma-2B,0.011904761904761904,0.2803251363458862,0.557515859867883,0.8041128629884543
9
+ Qwen1.5-4B-Chat,0.42857142857142855,0.7503365683315548,0.5549539275574193,0.8483871388011062
10
+ Vicuna-13B-V1.5,0.7223381250590647,0.5398926752022207,0.5186514512565293,0.6634719608098028
11
+ Yi-9B,0.25,0.31590509156704094,0.4925135259219739,0.7803641425669777
12
+ Baichuan2-7B-Chat,0.42857142857142855,0.5020120070047182,0.4880968089375992,0.8529672546477044
13
+ Vicuna-7B-V1.5,0.14285714285714285,0.5019194353900974,0.4722927782884535,0.8320120749350508
14
+ Gpt-3.5-Turbo,0.6428571428571429,0.7368772770427375,0.46705283670036873,0.8518465563087741
15
+ Qwen1.5-1.8B-Chat,0.40816326530612246,0.6187149363151768,0.46468776283363905,0.8536228462063511
16
+ Qwen1.5-14B-Chat,0.5,0.62189263577569,0.44546489401960593,0.8573697801600568
17
+ Qwen1.5-7B-Chat,0.5714285714285714,0.6304668631060011,0.4380453591507842,0.8549025250385108
18
+ Gemma-7B,0.25,0.09449475661345018,0.4189708246631338,0.7939624827076257
19
+ Mistral-7B,0.14285714285714285,0.10603847794715389,0.3972735247665132,0.7962369562089103
data/owl_en_qa.csv ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
2
+ Yi-6B-Chat,0.7030488252628491,0.8539968939451946,0.8345303291521267,0.8525487106316666
3
+ Qwen1.5-7B-Chat,0.7730288486126379,0.800263654571703,0.7701434694510604,0.8096226126029853
4
+ Vicuna-13B-V1.5,0.7252900133287701,0.8001237927827157,0.7513908712676198,0.8129508434301099
5
+ Gpt-3.5-Turbo,0.7601832993890021,0.9508419461436827,0.5945237008802382,0.8766794953769677
6
+ Qwen1.5-14B-Chat,0.8108575380359613,0.9186608285564491,0.5796170633787928,0.8627921465440817
7
+ Internlm2-Chat-20B,0.8823450852329868,0.8990122482812408,0.5663255561571012,0.817104818105292
8
+ Internlm2-Chat-7B,0.8716367920317769,0.9049173556355747,0.5566486218868514,0.8194293421446569
9
+ Vicuna-7B-V1.5,0.6687478686911705,0.8847336678547908,0.5491987169778965,0.8538950235036584
10
+ Qwen1.5-4B-Chat,0.7161414565826331,0.916949622281115,0.5415164042157119,0.8588077047327288
11
+ Qwen1.5-1.8B-Chat,0.7559747023809523,0.9469277644039529,0.5355121893517637,0.8511550798429494
12
+ Baichuan2-13B-Chat,0.724778459441036,0.9033782254193811,0.5324917996259314,0.8430175816264579
13
+ Baichuan2-7B-Chat,0.663319530710835,0.8543448236955469,0.5222686618152338,0.8364213008907668
14
+ Gemma-7B,0.5647578582126265,0.6814204309035338,0.5202336438594105,0.7806024397207423
15
+ Qwen1.5-0.5B-Chat,0.5679874805086168,0.8611226406276706,0.513748281764636,0.812332476681601
16
+ Mistral-7B,0.6586367313915859,0.7039079054469578,0.5078017923324171,0.7902698697096028
17
+ Gemma-2B,0.5049161881111284,0.6528267517862424,0.5059908632023802,0.7736166726699579
18
+ Yi-6B,0.5063160585604476,0.6749962990823568,0.49929516708962135,0.7789524853407436
data/owl_zh_qa.csv ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
2
+ Yi-6B-Chat,0.7600815667858686,0.7912540642980099,0.7610564816532298,0.6761740477117962
3
+ Qwen1.5-7B-Chat,0.7231431514350063,0.6963952727655642,0.7350526525675435,0.6957602523500673
4
+ Internlm2-Chat-7B,0.46693736812083075,0.30772894154056324,0.6038537202079961,0.7424094998006903
5
+ Gpt-3.5-Turbo,0.42171467364950055,0.8691295573045116,0.6038112679436712,0.870169867075605
6
+ Vicuna-13B-V1.5,0.613175326243208,0.682444153024839,0.5924435357123905,0.6493806776292096
7
+ Qwen1.5-14B-Chat,0.4991210277214334,0.7953872813217998,0.5891222801836271,0.8588771162081673
8
+ Internlm2-Chat-20B,0.7918762142818747,0.6187939679354695,0.5756624792803415,0.8195634463645642
9
+ Baichuan2-13B-Chat,0.4038672142368241,0.8192887757169468,0.5623602404354114,0.8562630521329339
10
+ Qwen1.5-4B-Chat,0.38204865489701556,0.8389697689558571,0.5521128189924648,0.8573317706502359
11
+ Gemma-7B,0.4059392201442353,0.35160449208958283,0.5377158689736348,0.7911868222195938
12
+ Qwen1.5-1.8B-Chat,0.5491781930806321,0.769059886385716,0.5344330706846868,0.8437705436461957
13
+ Baichuan2-7B-Chat,0.3683127572016461,0.7749038932071436,0.5101570739448591,0.8504995667294786
14
+ Yi-6B,0.366171888675488,0.35434640725576727,0.48557644112672105,0.7941020553273606
15
+ Qwen1.5-0.5B-Chat,0.2617687074829932,0.7710346032394836,0.4777335192926036,0.8346118169208395
16
+ Gemma-2B,0.45988486660889083,0.33416817486815076,0.4592030663597664,0.744093765199015
17
+ Mistral-7B,0.6409280685903916,0.4271051397084787,0.453115837113527,0.6903975251587856
18
+ Vicuna-7B-V1.5,0.39214979579762405,0.7771452487068145,0.44308513423549384,0.819478102998531
data/rzy_zh_qa.csv ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
2
+ Qwen1.5-7B-Chat,0.736588808239606,0.7333071013844119,0.7653953121660069,0.7581706400802051
3
+ Yi-6B-Chat,0.8623512542497523,0.8477785816251643,0.7511472076832977,0.6674107034023887
4
+ Internlm2-Chat-20B,0.7579980374579645,0.5592058560996795,0.6230274906657811,0.8809475791391357
5
+ Qwen1.5-1.8B-Chat,0.6993731684981686,0.6912058643011517,0.5921887447381617,0.895549081535767
6
+ Gemma-2B,0.4793590757810048,0.269158909387365,0.5715789320903422,0.8084277426065398
7
+ Yi-6B,0.39791261211082174,0.35010016811319283,0.5554912360774231,0.8151911650862558
8
+ Internlm2-Chat-7B,0.6057084170408346,0.28950531392496315,0.5513185635050407,0.8527933874140172
9
+ Vicuna-13B-V1.5,0.6149588477366256,0.7175132054894446,0.5484350782035007,0.8846316742953054
10
+ Gpt-3.5-Turbo,0.6702526487367563,0.8535199907928265,0.5380443637081317,0.9113351056689803
11
+ Baichuan2-7B-Chat,0.6457107843137256,0.7989283627012825,0.5355149927949222,0.8918899008657395
12
+ Qwen1.5-14B-Chat,0.7039449112978525,0.7891124698018288,0.5351538957435175,0.9060753469650263
13
+ Qwen1.5-4B-Chat,0.6079656862745099,0.798414770802262,0.5349164010626877,0.8926774424126845
14
+ Vicuna-7B-V1.5,0.5618038576473784,0.7385375964159062,0.5346381268062822,0.8785135365491068
15
+ Qwen1.5-0.5B-Chat,0.5161804573314475,0.7335961705843393,0.5329134165403151,0.878000802003553
16
+ Baichuan2-13B-Chat,0.6229674796747967,0.8122416536307804,0.5111467259298673,0.8969644779921856
17
+ Gemma-7B,0.6952392516403653,0.42448628847691194,0.4304401424621823,0.6676771540611001
18
+ Mistral-7B,0.8060009447278426,0.5415825155389061,0.39330883346357015,0.5908077476385994
data/zabbix_zh_qa.csv ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
2
+ Qwen1.5-7B-Chat,0.7784179666191972,0.6446005816609537,0.754576907135137,0.685739970384688
3
+ Gemma-2B,0.24509853619131058,0.3864916275690845,0.7478997831333918,0.5331677452195527
4
+ Qwen1.5-1.8B-Chat,0.6049019607843137,0.548904353980019,0.7198411238809446,0.9040869829649206
5
+ Yi-6B-Chat,0.7810660719751629,0.7167976589186875,0.6766795202404136,0.9015276627615246
6
+ Gpt-3.5-Turbo,0.7246376811594203,0.8043651144702015,0.6509014034090643,0.9218230132938211
7
+ Vicuna-7B-V1.5,0.4022032693674485,0.6008780718971581,0.6017002573832742,0.8863580258321797
8
+ Qwen1.5-14B-Chat,0.6328502415458936,0.7221033096305126,0.5966395914029399,0.9145067669966427
9
+ Qwen1.5-4B-Chat,0.5223151244890376,0.7184562339362471,0.5920241633149731,0.9031485610681586
10
+ Qwen1.5-0.5B-Chat,0.4166666666666667,0.6888304890847555,0.5800664845337345,0.8918967698708089
11
+ Internlm2-Chat-20B,0.7642667437926058,0.4826757943830156,0.5799585238847701,0.8876497593181047
12
+ Baichuan2-7B-Chat,0.48357487922705317,0.6222193378376162,0.578165574028535,0.9011250283968237
13
+ Vicuna-13B-V1.5,0.5991387785360396,0.6932977841508635,0.5747798091575604,0.8409686000423541
14
+ Baichuan2-13B-Chat,0.4896135265700483,0.7843769138572264,0.5653592173980313,0.9083387421281699
15
+ Internlm2-Chat-7B,0.558064058956916,0.10309630273051296,0.5526199329356135,0.8526944078477274
16
+ Yi-6B,0.3099052131839017,0.3558360880812697,0.5458649977309493,0.8362572777967558
17
+ Mistral-7B,0.6256150793650794,0.29009866821782115,0.5221570098966367,0.8399808519337731
18
+ Gemma-7B,0.4451515151515152,0.3045735267275342,0.5168775971677172,0.8368893516841295
data/zjyd_zh_qa.csv CHANGED
@@ -1,11 +1,18 @@
1
- name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
2
- GPT-3.5-turbo,79.48718,81.19658,88.39286,89.28571 ,79.48717949,81.1965812,86.60714286,88.39285714
3
- Gemma-2B,26.49573,26.49573,62.5,62.5 ,58.97435897,58.97435897,75,75
4
- Gemma-7B,77.77778,77.77778,75.89286,75.89286 ,76.06837607,76.06837607,86.60714286,86.60714286
5
- Qwen1.5-0.5B-Base,65.17857,65.17857,75,75 ,56.25,56.25,57.14285714,57.14285714
6
- Qwen1.5-0.5B-Chat,0,0,54.46429,53.57143 ,20.53571429,19.64285714,16.96428571,17.85714286
7
- Qwen1.5-1.8B-Base,71.42857,71.42857,71.42857,71.42857 ,70.53571429,70.53571429,80.35714286,80.35714286
8
- Qwen1.5-1.8B-Chat,73.21429,69.64286,67.85714,77.67857 ,66.07142857,66.07142857,68.75,75
9
- Qwen1.5-14B-Base,76.92308,76.92308,88.39286,88.39286 ,78.63247863,78.63247863,83.03571429,83.03571429
10
- Qwen1.5-14B-Chat,18.75,23.21429,91.07143,92.85714 ,79.46428571,78.57142857,80.35714286,83.92857143
11
-
 
 
 
 
 
 
 
 
1
+ name,Faithfulness,Answer_Relevancy,Answer_Correctness,Answer_Similarity
2
+ Qwen1.5-7B-Chat,0.9001343784903166,0.8310295209988445,0.9078620109210616,0.8729269504884732
3
+ Qwen1.5-4B-Chat,0.8871057360749349,0.8993513260808733,0.8307124664634027,0.820558540099676
4
+ Gpt-3.5-Turbo,0.844,0.9381622494626971,0.7014364361923093,0.9646698567096922
5
+ Qwen1.5-14B-Chat,0.9143835616438356,0.9096799403053909,0.6924722613000067,0.9576526234420509
6
+ Vicuna-13B-V1.5,0.8201754385964912,0.8811313951272425,0.68730238555884,0.9481117257306625
7
+ Baichuan2-13B-Chat,0.902,0.9045473946630944,0.6857149093288882,0.9578530098977669
8
+ Baichuan2-7B-Chat,0.8784722222222222,0.896849978755001,0.6751955501292016,0.9500810985536641
9
+ Qwen1.5-1.8B-Chat,0.9148888888888888,0.8586071776868396,0.6748854449858851,0.947046701753897
10
+ Yi-6B-Chat,0.9511929511929511,0.7986143744572479,0.6694793902546,0.9285801614165997
11
+ Qwen1.5-0.5B-Chat,0.8277777777777777,0.8901546106376419,0.6588250813657541,0.9469939028778743
12
+ Vicuna-7B-V1.5,0.7171052631578947,0.8301247992194959,0.6521358982668551,0.9382112592746454
13
+ Internlm2-Chat-20B,0.8146430093452255,0.6294665932615476,0.5592223065723815,0.9031372380769384
14
+ Internlm2-Chat-7B,0.7936354405828091,0.6059388264548148,0.5497547973508542,0.9071347182079667
15
+ Gemma-7B,0.5690370087428911,0.294443307376398,0.5182431858082619,0.8437500469275063
16
+ Yi-6B,0.4679211960033877,0.29049526322106994,0.4910372529026469,0.8409424204038982
17
+ Mistral-7B,0.7985507246376812,0.40909012863946165,0.4698180894318443,0.85180274226269
18
+ Gemma-2B,0.5461295296041059,0.32955654240675497,0.4138425436194475,0.8085919354670669
leaderboard/wired_network.csv DELETED
@@ -1,3 +0,0 @@
1
- Model, Zeroshot, Fewshot(3-shot), Best Score
2
- ChatGPT, 80, 80, 80
3
- GPT-3, 23, 23, 23
 
 
 
 
leaderboard/wired_network_en.csv DELETED
@@ -1,28 +0,0 @@
1
- -,-,Zero-shot,-,v,-,3-Shot,-,-,-,-
2
- -,-,Naïve,SC,CoT,CoT+SC,Naïve,SC,CoT,CoT+SC,Best Score
3
- 1,✨ GPT-4,/,/,/,/,/,/,88.7,88.7,88.7
4
- 2,✨ Yi-34B-Chat,57.75,59.14,65.11,68.79,68.16,68.37,78.09,80.06,80.06
5
- 3,✨ Qwen-72B-Chat,70.41,70.5,72.38,72.56,70.32,70.32,70.13,70.22,72.56
6
- 4,✨ GPT-3.5-turbo,66.6,66.8,69.6,72,68.3,68.3,70.9,72.5,72.5
7
- 5,✨ ERNIE-Bot-4.0,61.15,61.15,70,70,60,60,70,70,70
8
- 6,✨ qwen1.5-14b-chat,54.9,56.44,64.09,67.1,52.23,53.52,59.54,64.18,67.1
9
- 7,✨ qwen1.5-14b-base,34.88,34.88,60.82,60.82,65.55,65.55,47.08,47.08,65.55
10
- 8,✨ DevOps-Model-14B-Chat,30.69,30.59,55.77,63.63,63.85,61.96,41.15,44.01,63.85
11
- 9,✨ Qwen-14B-Chat,43.78,47.81,56.58,59.4,62.09,59.7,49.06,55.88,62.09
12
- 10,✨ LLaMA-2-13B,41.8,46.5,53.1,58.7,53.3,53,56.8,61,61
13
- 11,✨ InternLM2-Chat-20B,56.36,56.36,26.18,26.18,60.48,60.48,45.1,45.1,60.48
14
- 12,✨ LLaMA-2-70B-Chat,25.29,25.29,57.97,58.06,52.97,52.97,58.55,58.55,58.55
15
- 13,✨ InternLM2-Chat-7B,49.74,49.74,56.19,56.19,48.2,48.2,49.74,49.74,56.19
16
- 14,✨ LLaMA-2-7B,39.5,40,45.4,49.5,48.2,46.8,52,55.2,55.2
17
- 15,✨ Qwen-7B-Chat,45.9,46,47.3,50.1,52.1,51,48.3,49.8,52.1
18
- 16,✨ gemma_7b,25.09,25.09,50.86,50.86,30.24,30.24,51.56,51.56,51.56
19
- 17,✨ InternLM-7B,38.7,38.7,43.9,43.9,45.2,45.2,51.4,51.4,51.4
20
- 18,✨ Chinese-Alpaca-2-13B,37.7,37.7,49.7,49.7,48.6,48.6,50.5,50.5,50.5
21
- 19,✨ Mistral-7B,29.27,29.27,46.3,46.3,47.22,47.22,45.58,45.58,47.22
22
- 20,✨ AquilaChat2-34B,36.63,36.63,44.83,44.83,46.65,46.65,NULL,NULL,46.65
23
- 21,✨ ChatGLM3-6B,43.38,43.38,44.59,44.59,42.1,42.1,43.47,43.47,44.59
24
- 22,✨ ChatGLM2-6B,24.8,24.7,36.6,36.5,37.6,37.6,40.5,40.5,40.5
25
- 23,✨ Chinese-LLaMA-2-13B,29.4,29.4,37.8,37.8,40.4,40.4,28.8,28.8,40.4
26
- 24,✨ gemma_2b,26.46,26.46,33.42,33.42,26.63,26.63,37.54,37.54,37.54
27
- 25,✨ Baichuan-13B-Chat,18.3,20.4,28.6,37,24.1,26.7,18.2,17.8,37
28
- 26,✨ Baichuan2-13B-Chat,14.1,15.3,24.1,25.8,32.3,33.1,25.6,27.7,33.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
leaderboard/wired_network_zh.csv DELETED
@@ -1,31 +0,0 @@
1
- -,-,Zero-shot,-,v,-,3-Shot,-,-,-,-
2
- -,-,Naïve,SC,CoT,CoT+SC,Naïve,SC,CoT,CoT+SC,Best Score
3
- 1,✨ GPT-4,/,/,/,/,/,/,86,86,86
4
- 2,✨ ERNIE-Bot-4.0,67.54,67.54,71.96,71.96,72,72,78,78,78
5
- 3,✨ Yi-34B-Chat,61.61,62.56,68.11,69.75,65.73,65.37,69.88,71.21,71.21
6
- 4,✨ Qwen-72B-Chat,65.77,65.86,68.13,68.3,69.4,69.4,69.99,70.08,70.08
7
- 5,✨ Hunyuan-13B,60,60,70,70,,,,,70
8
- 6,✨ GPT-3.5-turbo,58.4,58.6,64.8,67.6,59.2,59.7,65.2,67.4,67.6
9
- 7,✨ GLM4,67.38,67.38,,,,,,,67.38
10
- 8,✨ qwen1.5-14b-chat,54.04,53.87,62.56,63.86,58.78,58.09,63.43,65.58,65.58
11
- 9,✨ DevOps-Model-14B-Chat,47.59,46.57,52.52,56.01,62.07,60.08,50.59,55.79,62.07
12
- 10,✨ qwen1.5-14b-base,45.18,45.18,59.12,59.12,61.1,61.1,52.5,52.5,61.1
13
- 11,✨ InternLM2-Chat-7B,54.3,54.3,59.81,59.81,58.52,58.52,51.64,51.64,59.81
14
- 12,✨ GLM3-turbo,59.64,59.64,,,,,,,59.64
15
- 13,✨ InternLM2-Chat-20B,57.49,57.49,57.14,57.14,59.12,59.12,50.77,50.77,59.12
16
- 14,✨ Qwen-14B-Chat,48.35,48.81,55.35,57.4,58.53,56.12,52.12,54.99,58.53
17
- 15,✨ LLaMA-2-70B-Chat,38.55,38.55,57.49,57.49,49.09,49.09,48.57,48.57,57.49
18
- 16,✨ LLaMA-2-13B,29.7,31.6,51.6,57,39.6,38.9,48,50.6,57
19
- 17,✨ Baichuan-13B-Chat,15.2,16,43.9,49.7,34.3,36.1,51.3,55.6,55.6
20
- 18,✨ LLaMA-2-7B,29.8,30.2,50.1,55.6,38.6,40.8,45.6,50.4,55.6
21
- 19,✨ Qwen-7B-Chat,29.6,29.9,50.6,53.5,50.4,46.9,46.9,47.7,53.5
22
- 20,✨ ChatGLM3-6B,41.39,41.39,49.23,49.23,38.81,38.81,42.86,42.86,49.23
23
- 21,✨ gemma_7b,31.58,31.58,47.59,47.59,34.68,34.68,48.88,48.88,48.88
24
- 22,✨ AquilaChat2-34B,34.66,34.66,47.74,47.74,44.48,44.48,NULL,NULL,47.74
25
- 23,✨ Mistral-7B,1.9,1.9,45.61,45.61,15,15,35.97,35.97,45.61
26
- 24,✨ Chinese-Alpaca-2-13B,33.1,33.1,44.2,44.2,44,44,42.7,42.7,44.2
27
- 25,✨ InternLM-7B,41.7,41.7,38.4,38.4,42.6,42.6,41.3,41.3,42.6
28
- 26,✨ ChatGLM2-6B,33.8,33.7,42.1,42.2,36,36,39.5,39.5,42.2
29
- 27,✨ Chinese-LLaMA-2-13B,22.5,22.5,38.8,38.8,41.8,41.8,32.2,32.2,41.8
30
- 28,✨ gemma_2b,29.69,29.69,39.16,39.16,29.78,29.78,38.64,38.64,39.16
31
- 29,✨ Baichuan2-13B-Chat,35.6,35.9,30.5,30.5,34.6,35.6,30.2,32,35.9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
leaderboards.py CHANGED
@@ -3,6 +3,7 @@ eng_leaderboards = [
3
  ('lenovo', ['mc']),
4
  ('oracle', ['mc']),
5
  ('network', ['mc', 'qa']),
 
6
  ]
7
 
8
  chi_leaderboards = [
@@ -11,12 +12,13 @@ chi_leaderboards = [
11
  ('zjyd', ['mc', 'qa']),
12
  ('network', ['mc', 'qa']),
13
  ('pufa', ['mc']),
14
- ('zabbix', ['mc']),
15
  ('dfcdata', ['mc']),
16
  ('zte', ['mc']),
17
  ('oracle', ['mc']),
18
  ('tencent', ['qa']),
19
- ('bosc', ['mc']),
20
- ('rzy', ['mc']),
21
  ('lenovo', ['mc']),
 
22
  ]
 
3
  ('lenovo', ['mc']),
4
  ('oracle', ['mc']),
5
  ('network', ['mc', 'qa']),
6
+ ('owl', ['qa']),
7
  ]
8
 
9
  chi_leaderboards = [
 
12
  ('zjyd', ['mc', 'qa']),
13
  ('network', ['mc', 'qa']),
14
  ('pufa', ['mc']),
15
+ ('zabbix', ['mc', 'qa']),
16
  ('dfcdata', ['mc']),
17
  ('zte', ['mc']),
18
  ('oracle', ['mc']),
19
  ('tencent', ['qa']),
20
+ ('bosc', ['mc', 'qa']),
21
+ ('rzy', ['mc', 'qa']),
22
  ('lenovo', ['mc']),
23
+ ('owl', ['qa']),
24
  ]
opseval_datasets.py CHANGED
@@ -11,7 +11,8 @@ datasets_abbr = [
11
  'tencent',
12
  'bosc',
13
  'rzy',
14
- 'lenovo'
 
15
  ]
16
 
17
  datasets_zh = [
@@ -27,7 +28,8 @@ datasets_zh = [
27
  'DevOps能力评测(腾讯)',
28
  '金融信创系统运维(上海银行)',
29
  '日志分析能力评测(日志易)',
30
- '混合云建设与运维(联想集团)'
 
31
  ]
32
 
33
  datasets_en = [
@@ -43,7 +45,8 @@ datasets_en = [
43
  "DevOps Capability",
44
  "Financial New Generation System",
45
  "Log Analysis",
46
- "Hybrid Cloud Construction and Operations"
 
47
  ]
48
 
49
  dataset_abbr_zh_dict = {
 
11
  'tencent',
12
  'bosc',
13
  'rzy',
14
+ 'lenovo',
15
+ 'owl',
16
  ]
17
 
18
  datasets_zh = [
 
28
  'DevOps能力评测(腾讯)',
29
  '金融信创系统运维(上海银行)',
30
  '日志分析能力评测(日志易)',
31
+ '混合云建设与运维(联想集团)',
32
+ 'OWL',
33
  ]
34
 
35
  datasets_en = [
 
45
  "DevOps Capability",
46
  "Financial New Generation System",
47
  "Log Analysis",
48
+ "Hybrid Cloud Construction and Operations",
49
+ "OWL",
50
  ]
51
 
52
  dataset_abbr_zh_dict = {