Spaces:

ml-energy
/

leaderboard

Running

App Files Community

Jae-Won Chung committed on Jun 18, 2023

Commit

e3571c1

1 Parent(s): 97b5f1c

Clean up

Browse files

Files changed (11) hide show

Dockerfile +1 -1
README.md +6 -5
extract.py +0 -69
leaderboard_1.csv +0 -5
leaderboard_2.csv +0 -5
leaderboard_3.csv +0 -5
models.txt +20 -0
running_command.sh +0 -27
benchmark.py → scripts/benchmark.py +2 -1
scripts/compute_metrics.py +25 -0
sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json +0 -0

Dockerfile CHANGED Viewed

@@ -11,7 +11,7 @@ RUN apt-get update -qq \
     && apt-get clean all \
     && rm -r /var/lib/apt/lists/*
-# Install Miniconda3 4.12.0
 ENV PATH="/root/.local/miniconda3/bin:$PATH"
 RUN mkdir -p /root/.local \
     && wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh \

     && apt-get clean all \
     && rm -r /var/lib/apt/lists/*
+# Install Miniconda3 23.3.1
 ENV PATH="/root/.local/miniconda3/bin:$PATH"
 RUN mkdir -p /root/.local \
     && wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh \

README.md CHANGED Viewed

@@ -1,11 +1,12 @@
-# ML.ENERGY Leaderboard
 ## Devs
-Currently setup in `ampere02`:
 1. Find model weights in `/data/leaderboard/weights/`, e.g. subdirectory `llama` and `vicuna`.
 2. Let's share the Huggingface Transformer cache:
 ```bash
@@ -19,6 +20,6 @@ $ docker build -t leaderboard:latest .
 $ docker run -it --name jw-leaderboard --gpus all --cap-add SYS_ADMIN -v /data/leaderboard:/data/leaderboard -v $HOME/workspace/leaderboard:/workspace/leaderboard leaderboard:latest bash
 # cd leaderboard
-# python benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
-# python benchmark.py --model-path databricks/dolly-v2-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
 ```

+<h1><a href="https://ml.energy" style="color: #27cb63; text-decoration: none">ML.ENERGY</a> Leaderboard</h1>
+How much energy do LLMs consume?
 ## Devs
+Current setup in `ampere02`:
 1. Find model weights in `/data/leaderboard/weights/`, e.g. subdirectory `llama` and `vicuna`.
 2. Let's share the Huggingface Transformer cache:
 ```bash
 $ docker run -it --name jw-leaderboard --gpus all --cap-add SYS_ADMIN -v /data/leaderboard:/data/leaderboard -v $HOME/workspace/leaderboard:/workspace/leaderboard leaderboard:latest bash
 # cd leaderboard
+# python scripts/benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
+# python scripts/benchmark.py --model-path databricks/dolly-v2-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
 ```

extract.py DELETED Viewed

@@ -1,69 +0,0 @@
-import re
-import json
-import numpy as np
-import statistics
-import os
-import csv
-model = []
-throughput = []
-response_length = []
-latency = []
-energy = []
-temp_throughput = []
-temp_response_length = []
-temp_latency = []
-temp_energy = []
-model_name = os.listdir("data/chat")
-match_name = False
-for models in model_name:
-    with open("data/chat/"+models+"/benchmark.json", 'r') as file:
-        json_data = json.load(file)
-    for obj in json_data:
-        if not match_name:
-            name = str(obj["model"])
-            model.append(name.replace('--','/'))
-            match_name = True
-        temp_throughput.append(float(obj["throughput"]))
-        temp_response_length.append(float(obj["response_length"]))
-        temp_latency.append(float(obj["latency"]))
-        temp_energy.append(float(obj["energy"]))
-    match_name = False
-    throughput.append(temp_throughput.copy())
-    response_length.append(temp_response_length.copy())
-    latency.append(temp_latency.copy())
-    energy.append(temp_energy.copy())
-    temp_throughput.clear()
-    temp_response_length.clear()
-    temp_latency.clear()
-    temp_energy.clear()
-avg_throughput = [statistics.mean(row) for row in throughput]
-avg_response_length = [statistics.mean(row) for row in response_length]
-avg_latency = [statistics.mean(row) for row in latency]
-avg_energy = [statistics.mean(row) for row in energy]
-for i in range(len(model)):
-    print(model[i])
-    print(len(throughput[i]))
-    print(len(response_length[i]))
-    print(len(latency[i]))
-    print(len(energy[i]))
-csv_file = "leaderboard.csv"
-with open(csv_file, "w", newline="") as file:
-    writer = csv.writer(file)
-    writer.writerow(["model","throughput","response_length","latency","energy"])
-    for i in range(len(model)):
-        writer.writerow([model[i], avg_throughput[i], avg_response_length[i], avg_latency[i], avg_energy[i]])

leaderboard_1.csv DELETED Viewed

@@ -1,5 +0,0 @@
-model,score,throughput,response_length,latency,energy
-lmsys/vicuna-7B,1000,30.08236985276053,283.0862995298858,9.431178230227955,2271.4826004029537
-lmsys/vicuna-13B,1000,17.509990378755237,281.76623376623377,16.124334009682688,4283.697810470779
-tatsu-lab/alpaca-7B,1000,30.09713731797294,125.20013431833445,4.129986896187982,916.045386501007
-metaai/llama-7B,1000,25.768609507174105,64.59032907991941,2.284814629996714,525.7081235728675

leaderboard_2.csv DELETED Viewed

@@ -1,5 +0,0 @@
-model,score,throughput,response_length,latency,energy
-metaai/llama-13B,1000,15.699146010424393,80.32236400268637,4.757332595030835,1293.689832437891
-camel-ai/CAMEL-13B-Combined-Data,1000,17.408929446926095,292.3656943839791,16.840487937994777,4481.158658249824
-BlinkDL/RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth,1000,33.10830960148045,243.21793149764943,6.9481068778416555,1833.7241615177682
-databricks/dolly-v2-12b,1000,15.597444626791148,148.3270651443922,9.168758730287117,2362.087664204047

leaderboard_3.csv DELETED Viewed

@@ -1,5 +0,0 @@
-model,score,throughput,response_length,latency,energy
-FreedomIntelligence/phoenix-inst-chat-7b,1000,32.663340053939855,243.14909335124244,7.271332307256473,2149.2483156478947
-h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2,1000,28.851651162429675,216.66286098052385,7.544740398256815,1636.1981326393268
-lmsys/fastchat-t5-3b-v1.0,1000,17.78202422600336,313.22527472527474,23.570470748014376,2255.7007728936983
-Neutralzz/BiLLa-7B-SFT,1000,29.49201862368961,159.29986568166555,5.443799112468728,1218.644757555166

models.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+/data/leaderboard/weights/metaai/llama-7B
+/data/leaderboard/weights/metaai/llama-13B
+/data/leaderboard/weights/lmsys/vicuna-7B
+/data/leaderboard/weights/lmsys/vicuna-13B
+/data/leaderboard/weights/tatsu-lab/alpaca-7B
+/data/leaderboard/weights/BAIR/koala-7b
+/data/leaderboard/weights/BAIR/koala-13b
+/data/leaderboard/weights/BlinkDL/RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth
+camel-ai/CAMEL-13B-Combined-Data
+databricks/dolly-v2-12b
+FreedomIntelligence/phoenix-inst-chat-7b
+h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2
+lmsys/fastchat-t5-3b-v1.0
+Neutralzz/BiLLa-7B-SFT
+nomic-ai/gpt4all-13b-snoozy
+openaccess-ai-collective/manticore-13b-chat-pyg
+OpenAssistant/oasst-sft-1-pythia-12b
+project-baize/baize-v2-7B
+StabilityAI/stablelm-tuned-alpha-7b
+togethercomputer/RedPajama-INCITE-7B-Chat

running_command.sh DELETED Viewed

@@ -1,27 +0,0 @@
-#!/bin/bash
-# node with four gpus
-python benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
-python benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-13B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
-python benchmark.py --model-path /data/leaderboard/weights/tatsu-lab/alpaca-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
-python benchmark.py --model-path /data/leaderboard/weights/metaai/llama-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
-python benchmark.py --model-path /data/leaderboard/weights/metaai/llama-13B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
-python benchmark.py --model-path camel-ai/CAMEL-13B-Combined-Data --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
-python benchmark.py --model-path /data/leaderboard/weights/BlinkDL/RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
-python benchmark.py --model-path databricks/dolly-v2-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
-python benchmark.py --model-path FreedomIntelligence/phoenix-inst-chat-7b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
-python benchmark.py --model-path h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2 --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
-python benchmark.py --model-path lmsys/fastchat-t5-3b-v1.0 --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
-python benchmark.py --model-path Neutralzz/BiLLa-7B-SFT --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
-python benchmark.py --model-path nomic-ai/gpt4all-13b-snoozy --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
-python benchmark.py --model-path openaccess-ai-collective/manticore-13b-chat-pyg --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
-python benchmark.py --model-path OpenAssistant/oasst-sft-1-pythia-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
-python benchmark.py --model-path project-baize/baize-v2-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
-python benchmark.py --model-path /data/leaderboard/weights/BAIR/koala-7b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
-python benchmark.py --model-path /data/leaderboard/weights/BAIR/koala-13b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
-python benchmark.py --model-path StabilityAI/stablelm-tuned-alpha-7b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
-python benchmark.py --model-path togethercomputer/RedPajama-INCITE-7B-Chat --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3

benchmark.py → scripts/benchmark.py RENAMED Viewed

@@ -40,7 +40,7 @@ SYSTEM_PROMPTS = {
 def main(
     model_path: str,
-    input_file: str,
     output_dir: str = "data",
     device_index: int = 0,
     task: Literal[tuple(SYSTEM_PROMPTS)] = "chat",  # type: ignore
@@ -54,6 +54,7 @@ def main(
     Args:
         model_path: Path to or Huggingface Hub Id of the model.
         input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
         output_dir: Path to the output directory. (Default: "data")
         device_index: Index of the GPU to use for inference. (Default: 0)
         task: Type of task to perform inference on. (Default: "chat")

 def main(
     model_path: str,
+    input_file: str = "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json",
     output_dir: str = "data",
     device_index: int = 0,
     task: Literal[tuple(SYSTEM_PROMPTS)] = "chat",  # type: ignore
     Args:
         model_path: Path to or Huggingface Hub Id of the model.
         input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
+            (Default: "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json")
         output_dir: Path to the output directory. (Default: "data")
         device_index: Index of the GPU to use for inference. (Default: 0)
         task: Type of task to perform inference on. (Default: "chat")

scripts/compute_metrics.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import os
+import csv
+import tyro
+import pandas as pd
+def main(data_dir: str, out_file: str) -> None:
+    """Compute metrics for all models in the given directory."""
+    model_names = os.listdir(data_dir)
+    print(f"{model_names=}")
+    out_csv = csv.writer(open(out_file, "w", newline=""))
+    metrics = ["throughput", "response_length", "latency", "energy"]
+    out_csv.writerow(["model"] + metrics)
+    for model_name in model_names:
+        df = pd.read_json(f"{data_dir}/{model_name}/benchmark.json")
+        out_csv.writerow(
+            [model_name.replace("--", "/")] + df[metrics].mean().to_list(),
+        )
+if __name__ == "__main__":
+    tyro.cli(main)

sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json ADDED Viewed

The diff for this file is too large to render. See raw diff