Junetheriver committed on
Commit 45c0614
1 Parent(s): dc1251a

added wired network results

.gitignore CHANGED
@@ -1,2 +1,3 @@
  __pycache__/
  flagged/
+ .DS_Store
README.md CHANGED
@@ -24,7 +24,7 @@ size_categories:
  - 1K<n<10K
  ---
 
- # OpsEval Dataset
+ # **🎉 🎉 OpsEval 👏 👏**
 
  [Website](https://opseval.cstcloud.cn/content/home) | [Reporting Issues](https://github.com/NetManAIOps/OpsEval-Datasets/issues/new)
app.py CHANGED
@@ -11,16 +11,8 @@ from apscheduler.schedulers.background import BackgroundScheduler
  from texts import INTRODUCTION_TEXT, TITLE
 
  df_lang = {
-     'en': pd.DataFrame({
-         'name': ['GPT-3', 'T5', 'BERT', 'RoBERTa', 'XLNet'],
-         'score': [0.75, 0.72, 0.68, 0.65, 0.62],
-         'rank': [1, 2, 3, 4, 5]
-     }),
-     'zh': pd.DataFrame({
-         'name': ['GPT-3', 'T5', 'BERT', 'RoBERTa', 'XLNet'],
-         'score': [0.75, 0.72, 0.68, 0.65, 0.62],
-         'rank': [1, 2, 3, 4, 5]
-     }),
+     'English': pd.read_csv("./leaderboard/wired_network_en.csv"),
+     'Chinese': pd.read_csv("./leaderboard/wired_network_zh.csv"),
  }
 
  def create_lang_leader_board(df):
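
For reviewers, a self-contained usage sketch (not part of this commit) of how the new `df_lang` mapping could drive one leaderboard tab per language. The body of `create_lang_leader_board` is outside this hunk, so the version below is a hypothetical stand-in that simply wraps `gr.Dataframe`; the real app may differ.

```python
# Hypothetical sketch only: create_lang_leader_board here is a stand-in, not
# the function defined in app.py (its body is not shown in this diff).
import gradio as gr
import pandas as pd

df_lang = {
    'English': pd.read_csv("./leaderboard/wired_network_en.csv"),
    'Chinese': pd.read_csv("./leaderboard/wired_network_zh.csv"),
}

def create_lang_leader_board(df: pd.DataFrame) -> gr.Dataframe:
    # Render the leaderboard read-only; sorting/styling left to the real app.
    return gr.Dataframe(value=df, interactive=False)

with gr.Blocks() as demo:
    # One tab per language, each showing its leaderboard table.
    for lang, df in df_lang.items():
        with gr.Tab(lang):
            create_lang_leader_board(df)

if __name__ == "__main__":
    demo.launch()
```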
leaderboard/wired_network_en.csv ADDED
@@ -0,0 +1,28 @@
+ -,-,Zero-shot,-,-,-,3-Shot,-,-,-,-
+ -,-,Naïve,SC,CoT,CoT+SC,Naïve,SC,CoT,CoT+SC,Best Score
+ 1,✨ GPT-4,/,/,/,/,/,/,88.7,88.7,88.7
+ 2,✨ Yi-34B-Chat,57.75,59.14,65.11,68.79,68.16,68.37,78.09,80.06,80.06
+ 3,✨ Qwen-72B-Chat,70.41,70.5,72.38,72.56,70.32,70.32,70.13,70.22,72.56
+ 4,✨ GPT-3.5-turbo,66.6,66.8,69.6,72,68.3,68.3,70.9,72.5,72.5
+ 5,✨ ERNIE-Bot-4.0,61.15,61.15,70,70,60,60,70,70,70
+ 6,✨ qwen1.5-14b-chat,54.9,56.44,64.09,67.1,52.23,53.52,59.54,64.18,67.1
+ 7,✨ qwen1.5-14b-base,34.88,34.88,60.82,60.82,65.55,65.55,47.08,47.08,65.55
+ 8,✨ DevOps-Model-14B-Chat,30.69,30.59,55.77,63.63,63.85,61.96,41.15,44.01,63.85
+ 9,✨ Qwen-14B-Chat,43.78,47.81,56.58,59.4,62.09,59.7,49.06,55.88,62.09
+ 10,✨ LLaMA-2-13B,41.8,46.5,53.1,58.7,53.3,53,56.8,61,61
+ 11,✨ InternLM2-Chat-20B,56.36,56.36,26.18,26.18,60.48,60.48,45.1,45.1,60.48
+ 12,✨ LLaMA-2-70B-Chat,25.29,25.29,57.97,58.06,52.97,52.97,58.55,58.55,58.55
+ 13,✨ InternLM2-Chat-7B,49.74,49.74,56.19,56.19,48.2,48.2,49.74,49.74,56.19
+ 14,✨ LLaMA-2-7B,39.5,40,45.4,49.5,48.2,46.8,52,55.2,55.2
+ 15,✨ Qwen-7B-Chat,45.9,46,47.3,50.1,52.1,51,48.3,49.8,52.1
+ 16,✨ gemma_7b,25.09,25.09,50.86,50.86,30.24,30.24,51.56,51.56,51.56
+ 17,✨ InternLM-7B,38.7,38.7,43.9,43.9,45.2,45.2,51.4,51.4,51.4
+ 18,✨ Chinese-Alpaca-2-13B,37.7,37.7,49.7,49.7,48.6,48.6,50.5,50.5,50.5
+ 19,✨ Mistral-7B,29.27,29.27,46.3,46.3,47.22,47.22,45.58,45.58,47.22
+ 20,✨ AquilaChat2-34B,36.63,36.63,44.83,44.83,46.65,46.65,NULL,NULL,46.65
+ 21,✨ ChatGLM3-6B,43.38,43.38,44.59,44.59,42.1,42.1,43.47,43.47,44.59
+ 22,✨ ChatGLM2-6B,24.8,24.7,36.6,36.5,37.6,37.6,40.5,40.5,40.5
+ 23,✨ Chinese-LLaMA-2-13B,29.4,29.4,37.8,37.8,40.4,40.4,28.8,28.8,40.4
+ 24,✨ gemma_2b,26.46,26.46,33.42,33.42,26.63,26.63,37.54,37.54,37.54
+ 25,✨ Baichuan-13B-Chat,18.3,20.4,28.6,37,24.1,26.7,18.2,17.8,37
+ 26,✨ Baichuan2-13B-Chat,14.1,15.3,24.1,25.8,32.3,33.1,25.6,27.7,33.1
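
Note that the file ships with two label rows (prompt setting, then scoring method) plus `/` and `NULL` placeholders, so a plain `pd.read_csv` treats the first label row as the header and the second as a data row, leaving the score columns as strings. Below is a minimal loading sketch under that reading; the flat column names are invented here purely for illustration and are not part of this commit.

```python
import pandas as pd

# Flat column names invented for this sketch; the real app may keep the raw header.
COLUMNS = [
    "rank", "model",
    "zs_naive", "zs_sc", "zs_cot", "zs_cot_sc",   # Zero-shot columns
    "fs_naive", "fs_sc", "fs_cot", "fs_cot_sc",   # 3-Shot columns
    "best",                                       # Best Score
]

df = pd.read_csv(
    "./leaderboard/wired_network_en.csv",
    skiprows=2,                # skip the two label rows
    header=None,
    names=COLUMNS,
    na_values=["/", "NULL"],   # unevaluated settings become NaN
)

# Top five models by best score across all prompt settings.
print(df.nlargest(5, "best")[["model", "best"]])
```

Dropping the label rows and mapping `/` and `NULL` to `NaN` keeps the score columns numeric, so sorting and ranking work directly.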
leaderboard/wired_network_zh.csv ADDED
@@ -0,0 +1,31 @@
+ -,-,Zero-shot,-,-,-,3-Shot,-,-,-,-
+ -,-,Naïve,SC,CoT,CoT+SC,Naïve,SC,CoT,CoT+SC,Best Score
+ 1,✨ GPT-4,/,/,/,/,/,/,86,86,86
+ 2,✨ ERNIE-Bot-4.0,67.54,67.54,71.96,71.96,72,72,78,78,78
+ 3,✨ Yi-34B-Chat,61.61,62.56,68.11,69.75,65.73,65.37,69.88,71.21,71.21
+ 4,✨ Qwen-72B-Chat,65.77,65.86,68.13,68.3,69.4,69.4,69.99,70.08,70.08
+ 5,✨ Hunyuan-13B,60,60,70,70,,,,,70
+ 6,✨ GPT-3.5-turbo,58.4,58.6,64.8,67.6,59.2,59.7,65.2,67.4,67.6
+ 7,✨ GLM4,67.38,67.38,,,,,,,67.38
+ 8,✨ qwen1.5-14b-chat,54.04,53.87,62.56,63.86,58.78,58.09,63.43,65.58,65.58
+ 9,✨ DevOps-Model-14B-Chat,47.59,46.57,52.52,56.01,62.07,60.08,50.59,55.79,62.07
+ 10,✨ qwen1.5-14b-base,45.18,45.18,59.12,59.12,61.1,61.1,52.5,52.5,61.1
+ 11,✨ InternLM2-Chat-7B,54.3,54.3,59.81,59.81,58.52,58.52,51.64,51.64,59.81
+ 12,✨ GLM3-turbo,59.64,59.64,,,,,,,59.64
+ 13,✨ InternLM2-Chat-20B,57.49,57.49,57.14,57.14,59.12,59.12,50.77,50.77,59.12
+ 14,✨ Qwen-14B-Chat,48.35,48.81,55.35,57.4,58.53,56.12,52.12,54.99,58.53
+ 15,✨ LLaMA-2-70B-Chat,38.55,38.55,57.49,57.49,49.09,49.09,48.57,48.57,57.49
+ 16,✨ LLaMA-2-13B,29.7,31.6,51.6,57,39.6,38.9,48,50.6,57
+ 17,✨ Baichuan-13B-Chat,15.2,16,43.9,49.7,34.3,36.1,51.3,55.6,55.6
+ 18,✨ LLaMA-2-7B,29.8,30.2,50.1,55.6,38.6,40.8,45.6,50.4,55.6
+ 19,✨ Qwen-7B-Chat,29.6,29.9,50.6,53.5,50.4,46.9,46.9,47.7,53.5
+ 20,✨ ChatGLM3-6B,41.39,41.39,49.23,49.23,38.81,38.81,42.86,42.86,49.23
+ 21,✨ gemma_7b,31.58,31.58,47.59,47.59,34.68,34.68,48.88,48.88,48.88
+ 22,✨ AquilaChat2-34B,34.66,34.66,47.74,47.74,44.48,44.48,NULL,NULL,47.74
+ 23,✨ Mistral-7B,1.9,1.9,45.61,45.61,15,15,35.97,35.97,45.61
+ 24,✨ Chinese-Alpaca-2-13B,33.1,33.1,44.2,44.2,44,44,42.7,42.7,44.2
+ 25,✨ InternLM-7B,41.7,41.7,38.4,38.4,42.6,42.6,41.3,41.3,42.6
+ 26,✨ ChatGLM2-6B,33.8,33.7,42.1,42.2,36,36,39.5,39.5,42.2
+ 27,✨ Chinese-LLaMA-2-13B,22.5,22.5,38.8,38.8,41.8,41.8,32.2,32.2,41.8
+ 28,✨ gemma_2b,29.69,29.69,39.16,39.16,29.78,29.78,38.64,38.64,39.16
+ 29,✨ Baichuan2-13B-Chat,35.6,35.9,30.5,30.5,34.6,35.6,30.2,32,35.9
texts.py CHANGED
@@ -1,7 +1,23 @@
- TITLE = '<h1 align="center" id="space-title">The OpsEval Leaderboard</h1>'
+ TITLE = '<h1 align="center" id="space-title">🎉 🎉 The OpsEval Leaderboard 👏 👏</h1>'
 
  INTRODUCTION_TEXT = '''
+ 
+ # 🚀 About OpsEval
+ 
  The OpsEval dataset represents a pioneering effort in the evaluation of Artificial Intelligence for IT Operations (AIOps), focusing on the application of Large Language Models (LLMs) within this domain. In an era where IT operations are increasingly reliant on AI technologies for automation and efficiency, understanding the performance of LLMs in operational tasks becomes crucial. OpsEval offers a comprehensive task-oriented benchmark specifically designed for assessing LLMs in various crucial IT Ops scenarios.
 
  This dataset is motivated by the emerging trend of utilizing AI in automated IT operations, as predicted by Gartner, and the remarkable capabilities exhibited by LLMs in NLP-related tasks. OpsEval aims to bridge the gap in evaluating these models' performance in AIOps tasks, including root cause analysis of failures, generation of operations and maintenance scripts, and summarizing alert information.
+ 
+ # 📃 Citation
+ ```
+ @misc{liu2023opseval,
+       title={OpsEval: A Comprehensive Task-Oriented AIOps Benchmark for Large Language Models},
+       author={Yuhe Liu and Changhua Pei and Longlong Xu and Bohan Chen and Mingze Sun and Zhirui Zhang and Yongqian Sun and Shenglin Zhang and Kun Wang and Haiming Zhang and Jianhui Li and Gaogang Xie and Xidao Wen and Xiaohui Nie and Dan Pei},
+       year={2023},
+       eprint={2310.07637},
+       archivePrefix={arXiv},
+       primaryClass={cs.AI}
+ }
+ ```
+ 
  '''