Jae-Won Chung committed on
Commit
e3571c1
1 Parent(s): 97b5f1c
Dockerfile CHANGED
@@ -11,7 +11,7 @@ RUN apt-get update -qq \
11
  && apt-get clean all \
12
  && rm -r /var/lib/apt/lists/*
13
 
14
- # Install Miniconda3 4.12.0
15
  ENV PATH="/root/.local/miniconda3/bin:$PATH"
16
  RUN mkdir -p /root/.local \
17
  && wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh \
 
11
  && apt-get clean all \
12
  && rm -r /var/lib/apt/lists/*
13
 
14
+ # Install Miniconda3 23.3.1
15
  ENV PATH="/root/.local/miniconda3/bin:$PATH"
16
  RUN mkdir -p /root/.local \
17
  && wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh \
README.md CHANGED
@@ -1,11 +1,12 @@
1
- # ML.ENERGY Leaderboard
 
 
2
 
3
  ## Devs
4
 
5
- Currently setup in `ampere02`:
6
 
7
  1. Find model weights in `/data/leaderboard/weights/`, e.g. subdirectory `llama` and `vicuna`.
8
-
9
  2. Let's share the Huggingface Transformer cache:
10
 
11
  ```bash
@@ -19,6 +20,6 @@ $ docker build -t leaderboard:latest .
19
  $ docker run -it --name jw-leaderboard --gpus all --cap-add SYS_ADMIN -v /data/leaderboard:/data/leaderboard -v $HOME/workspace/leaderboard:/workspace/leaderboard leaderboard:latest bash
20
 
21
  # cd leaderboard
22
- # python benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
23
- # python benchmark.py --model-path databricks/dolly-v2-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
24
  ```
 
1
+ <h1><a href="https://ml.energy" style="color: #27cb63; text-decoration: none">ML.ENERGY</a> Leaderboard</h1>
2
+
3
+ How much energy do LLMs consume?
4
 
5
  ## Devs
6
 
7
+ Current setup in `ampere02`:
8
 
9
  1. Find model weights in `/data/leaderboard/weights/`, e.g. subdirectory `llama` and `vicuna`.
 
10
  2. Let's share the Huggingface Transformer cache:
11
 
12
  ```bash
 
20
  $ docker run -it --name jw-leaderboard --gpus all --cap-add SYS_ADMIN -v /data/leaderboard:/data/leaderboard -v $HOME/workspace/leaderboard:/workspace/leaderboard leaderboard:latest bash
21
 
22
  # cd leaderboard
23
+ # python scripts/benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
24
+ # python scripts/benchmark.py --model-path databricks/dolly-v2-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
25
  ```
extract.py DELETED
@@ -1,69 +0,0 @@
1
- import re
2
- import json
3
- import numpy as np
4
- import statistics
5
- import os
6
- import csv
7
-
8
- model = []
9
- throughput = []
10
- response_length = []
11
- latency = []
12
- energy = []
13
-
14
- temp_throughput = []
15
- temp_response_length = []
16
- temp_latency = []
17
- temp_energy = []
18
-
19
- model_name = os.listdir("data/chat")
20
-
21
- match_name = False
22
-
23
- for models in model_name:
24
- with open("data/chat/"+models+"/benchmark.json", 'r') as file:
25
- json_data = json.load(file)
26
-
27
- for obj in json_data:
28
- if not match_name:
29
- name = str(obj["model"])
30
- model.append(name.replace('--','/'))
31
- match_name = True
32
- temp_throughput.append(float(obj["throughput"]))
33
- temp_response_length.append(float(obj["response_length"]))
34
- temp_latency.append(float(obj["latency"]))
35
- temp_energy.append(float(obj["energy"]))
36
-
37
- match_name = False
38
-
39
- throughput.append(temp_throughput.copy())
40
- response_length.append(temp_response_length.copy())
41
- latency.append(temp_latency.copy())
42
- energy.append(temp_energy.copy())
43
-
44
- temp_throughput.clear()
45
- temp_response_length.clear()
46
- temp_latency.clear()
47
- temp_energy.clear()
48
-
49
-
50
- avg_throughput = [statistics.mean(row) for row in throughput]
51
- avg_response_length = [statistics.mean(row) for row in response_length]
52
- avg_latency = [statistics.mean(row) for row in latency]
53
- avg_energy = [statistics.mean(row) for row in energy]
54
-
55
- for i in range(len(model)):
56
- print(model[i])
57
- print(len(throughput[i]))
58
- print(len(response_length[i]))
59
- print(len(latency[i]))
60
- print(len(energy[i]))
61
-
62
- csv_file = "leaderboard.csv"
63
-
64
- with open(csv_file, "w", newline="") as file:
65
- writer = csv.writer(file)
66
- writer.writerow(["model","throughput","response_length","latency","energy"])
67
- for i in range(len(model)):
68
- writer.writerow([model[i], avg_throughput[i], avg_response_length[i], avg_latency[i], avg_energy[i]])
69
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
leaderboard_1.csv DELETED
@@ -1,5 +0,0 @@
1
- model,score,throughput,response_length,latency,energy
2
- lmsys/vicuna-7B,1000,30.08236985276053,283.0862995298858,9.431178230227955,2271.4826004029537
3
- lmsys/vicuna-13B,1000,17.509990378755237,281.76623376623377,16.124334009682688,4283.697810470779
4
- tatsu-lab/alpaca-7B,1000,30.09713731797294,125.20013431833445,4.129986896187982,916.045386501007
5
- metaai/llama-7B,1000,25.768609507174105,64.59032907991941,2.284814629996714,525.7081235728675
 
 
 
 
 
 
leaderboard_2.csv DELETED
@@ -1,5 +0,0 @@
1
- model,score,throughput,response_length,latency,energy
2
- metaai/llama-13B,1000,15.699146010424393,80.32236400268637,4.757332595030835,1293.689832437891
3
- camel-ai/CAMEL-13B-Combined-Data,1000,17.408929446926095,292.3656943839791,16.840487937994777,4481.158658249824
4
- BlinkDL/RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth,1000,33.10830960148045,243.21793149764943,6.9481068778416555,1833.7241615177682
5
- databricks/dolly-v2-12b,1000,15.597444626791148,148.3270651443922,9.168758730287117,2362.087664204047
 
 
 
 
 
 
leaderboard_3.csv DELETED
@@ -1,5 +0,0 @@
1
- model,score,throughput,response_length,latency,energy
2
- FreedomIntelligence/phoenix-inst-chat-7b,1000,32.663340053939855,243.14909335124244,7.271332307256473,2149.2483156478947
3
- h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2,1000,28.851651162429675,216.66286098052385,7.544740398256815,1636.1981326393268
4
- lmsys/fastchat-t5-3b-v1.0,1000,17.78202422600336,313.22527472527474,23.570470748014376,2255.7007728936983
5
- Neutralzz/BiLLa-7B-SFT,1000,29.49201862368961,159.29986568166555,5.443799112468728,1218.644757555166
 
 
 
 
 
 
models.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /data/leaderboard/weights/metaai/llama-7B
2
+ /data/leaderboard/weights/metaai/llama-13B
3
+ /data/leaderboard/weights/lmsys/vicuna-7B
4
+ /data/leaderboard/weights/lmsys/vicuna-13B
5
+ /data/leaderboard/weights/tatsu-lab/alpaca-7B
6
+ /data/leaderboard/weights/BAIR/koala-7b
7
+ /data/leaderboard/weights/BAIR/koala-13b
8
+ /data/leaderboard/weights/BlinkDL/RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth
9
+ camel-ai/CAMEL-13B-Combined-Data
10
+ databricks/dolly-v2-12b
11
+ FreedomIntelligence/phoenix-inst-chat-7b
12
+ h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2
13
+ lmsys/fastchat-t5-3b-v1.0
14
+ Neutralzz/BiLLa-7B-SFT
15
+ nomic-ai/gpt4all-13b-snoozy
16
+ openaccess-ai-collective/manticore-13b-chat-pyg
17
+ OpenAssistant/oasst-sft-1-pythia-12b
18
+ project-baize/baize-v2-7B
19
+ StabilityAI/stablelm-tuned-alpha-7b
20
+ togethercomputer/RedPajama-INCITE-7B-Chat
running_command.sh DELETED
@@ -1,27 +0,0 @@
1
- #!/bin/bash
2
-
3
- # node with four gpus
4
- python benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
5
- python benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-13B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
6
- python benchmark.py --model-path /data/leaderboard/weights/tatsu-lab/alpaca-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
7
- python benchmark.py --model-path /data/leaderboard/weights/metaai/llama-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
8
-
9
- python benchmark.py --model-path /data/leaderboard/weights/metaai/llama-13B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
10
- python benchmark.py --model-path camel-ai/CAMEL-13B-Combined-Data --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
11
- python benchmark.py --model-path /data/leaderboard/weights/BlinkDL/RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
12
- python benchmark.py --model-path databricks/dolly-v2-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
13
-
14
- python benchmark.py --model-path FreedomIntelligence/phoenix-inst-chat-7b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
15
- python benchmark.py --model-path h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2 --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
16
- python benchmark.py --model-path lmsys/fastchat-t5-3b-v1.0 --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
17
- python benchmark.py --model-path Neutralzz/BiLLa-7B-SFT --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
18
-
19
- python benchmark.py --model-path nomic-ai/gpt4all-13b-snoozy --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
20
- python benchmark.py --model-path openaccess-ai-collective/manticore-13b-chat-pyg --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
21
- python benchmark.py --model-path OpenAssistant/oasst-sft-1-pythia-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
22
- python benchmark.py --model-path project-baize/baize-v2-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
23
-
24
- python benchmark.py --model-path /data/leaderboard/weights/BAIR/koala-7b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
25
- python benchmark.py --model-path /data/leaderboard/weights/BAIR/koala-13b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
26
- python benchmark.py --model-path StabilityAI/stablelm-tuned-alpha-7b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
27
- python benchmark.py --model-path togethercomputer/RedPajama-INCITE-7B-Chat --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark.py → scripts/benchmark.py RENAMED
@@ -40,7 +40,7 @@ SYSTEM_PROMPTS = {
40
 
41
  def main(
42
  model_path: str,
43
- input_file: str,
44
  output_dir: str = "data",
45
  device_index: int = 0,
46
  task: Literal[tuple(SYSTEM_PROMPTS)] = "chat", # type: ignore
@@ -54,6 +54,7 @@ def main(
54
  Args:
55
  model_path: Path to or Huggingface Hub Id of the model.
56
  input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
 
57
  output_dir: Path to the output directory. (Default: "data")
58
  device_index: Index of the GPU to use for inference. (Default: 0)
59
  task: Type of task to perform inference on. (Default: "chat")
 
40
 
41
  def main(
42
  model_path: str,
43
+ input_file: str = "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json",
44
  output_dir: str = "data",
45
  device_index: int = 0,
46
  task: Literal[tuple(SYSTEM_PROMPTS)] = "chat", # type: ignore
 
54
  Args:
55
  model_path: Path to or Huggingface Hub Id of the model.
56
  input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
57
+ (Default: "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json")
58
  output_dir: Path to the output directory. (Default: "data")
59
  device_index: Index of the GPU to use for inference. (Default: 0)
60
  task: Type of task to perform inference on. (Default: "chat")
scripts/compute_metrics.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import csv
3
+
4
+ import tyro
5
+ import pandas as pd
6
+
7
+
8
+ def main(data_dir: str, out_file: str) -> None:
9
+ """Compute metrics for all models in the given directory."""
10
+ model_names = os.listdir(data_dir)
11
+ print(f"{model_names=}")
12
+
13
+ out_csv = csv.writer(open(out_file, "w", newline=""))
14
+ metrics = ["throughput", "response_length", "latency", "energy"]
15
+ out_csv.writerow(["model"] + metrics)
16
+
17
+ for model_name in model_names:
18
+ df = pd.read_json(f"{data_dir}/{model_name}/benchmark.json")
19
+ out_csv.writerow(
20
+ [model_name.replace("--", "/")] + df[metrics].mean().to_list(),
21
+ )
22
+
23
+
24
+ if __name__ == "__main__":
25
+ tyro.cli(main)
sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json ADDED
The diff for this file is too large to render. See raw diff