Jae-Won Chung committed
Commit a679cf2
Parent: 8584e6d

Add benchmark.py, example data, README.md
.gitignore CHANGED
@@ -2,4 +2,3 @@
  .envrc
  pyrightconfig.json
  .idea
- .DS_Store
README.md CHANGED
@@ -20,3 +20,10 @@ Currently setup in `ampere02`:
  ```bash
  export TRANSFORMERS_CACHE=/data/leaderboard/hfcache
  ```
+
+ Run benchmarks like this:
+
+ ```console
+ $ python benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
+ $ python benchmark.py --model-path databricks/dolly-v2-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
+ ```
inference.py → benchmark.py RENAMED
@@ -2,11 +2,16 @@
 
  from __future__ import annotations
 
- from typing import Literal
+ import os
+ import json
+ import copy
+ import atexit
+ from typing import Generator, Literal
 
  import tyro
- import rich
  import torch
+ import rich
+ from rich.table import Table
  from fastchat.serve.inference import generate_stream
  from fastchat.model.model_adapter import load_model, get_conversation_template
  from zeus.monitor import ZeusMonitor
@@ -35,7 +40,7 @@ SYSTEM_PROMPTS = {
 
  def main(
      model_path: str,
-     input_prompt: str,
+     input_file: str,
      device_index: int = 0,
      task: Literal[tuple(SYSTEM_PROMPTS)] = "chat",  # type: ignore
      load_8bit: bool = False,
@@ -50,7 +55,7 @@ def main(
 
      Args:
          model_path: Path to or Huggingface Hub Id of the model.
-         input_prompt: Input prompt to use for inference.
+         input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
          device_index: Index of the GPU to use for inference.
          task: Type of task to perform inference on.
          load_8bit: Whether to load the model in 8-bit mode.
@@ -64,6 +69,26 @@ def main(
      if "chatglm" in model_path.lower():
          raise ValueError("ChatGLM is not supported.")
 
+     # Print out what we're about to do.
+     model_name_cleaned = "--".join(model_path.split("/")[-2:])
+     output_dir = f"data/{task}/{model_name_cleaned}"
+     output_csv_path = f"{output_dir}/benchmark.json"
+     config_json_path = f"{output_dir}/config.json"
+     table = Table(title="Benchmark")
+     table.add_column("Configuration")
+     table.add_column("Value")
+     table.add_row("Model", f"{model_name_cleaned} (path: {model_path})")
+     table.add_row("Input", input_file)
+     table.add_row("Device", f"cuda:{device_index}")
+     table.add_row("Task", task)
+     table.add_row("8-bit", str(load_8bit))
+     table.add_row("Temperature", f"{temperature:.2f}")
+     table.add_row("Repitition Penalty", f"{repitition_penalty:.2f}")
+     table.add_row("Max New Tokens", str(max_new_tokens))
+     table.add_row("Output CSV", output_csv_path)
+     table.add_row("Config JSON", config_json_path)
+     rich.get_console().print(table)
+
      # Set the device.
      torch.cuda.set_device(f"cuda:{device_index}")
 
@@ -80,51 +105,119 @@ def main(
      )
 
      # Chats are accumulated in a conversation helper object.
-     conv = get_conversation_template(model_path)
+     conv_base = get_conversation_template(model_path)
 
      # Standardize the system prompt for every model.
-     conv.system = SYSTEM_PROMPTS[task]
-     conv.messages = []
-     conv.offset = 0
+     conv_base.system = SYSTEM_PROMPTS[task]
+     conv_base.messages = []
+     conv_base.offset = 0
 
-     # Construct the input prompt.
-     conv.append_message(conv.roles[0], input_prompt)
-     conv.append_message(conv.roles[1], "")
-     prompt = conv.get_prompt()
-
-     # Generate the ouptut from the model.
      gen_params = {
          "model": model_path,
-         "prompt": prompt,
+         "prompt": "EMPTY",
          "temperature": temperature,
          "repitition_penalty": repitition_penalty,
          "max_new_tokens": max_new_tokens,
-         "stop": conv.stop_str,
-         "stop_token_ids": conv.stop_token_ids,
+         "stop": conv_base.stop_str,
+         "stop_token_ids": conv_base.stop_token_ids,
          "echo": False,
      }
-     output_stream = generate_stream(model, tokenizer, gen_params, device="cuda")
-     output = {}
 
-     # Inference and measurement!
      monitor = ZeusMonitor(gpu_indices=[torch.cuda.current_device()])
-     monitor.begin_window("inference")
-     for output in output_stream:
-         pass
-     measurements = monitor.end_window("inference")
-
-     # Print the input and output.
-     rich.print(f"\n[u]Prompt[/u]:\n{prompt.strip()}\n")
-     output_text = output["text"]
-     rich.print(f"\n[u]Response[/u]:\n{output_text.strip()}\n")
-
-     # Print numbers.
-     num_tokens = len(tokenizer.encode(output_text))
-     rich.print(measurements)
-     rich.print(f"Number of tokens: {num_tokens}")
-     rich.print(f"Tokens per seconds: {num_tokens / measurements.time:.2f}")
-     rich.print(f"Joules per token: {measurements.total_energy / num_tokens:.2f}")
-     rich.print(f"Average power consumption: {measurements.total_energy / measurements.time:.2f}")
+
+     # Output files.
+     # Leave only the last two path components and replace slashes with double dashes.
+     os.makedirs(output_dir, exist_ok=True)
+     output_json = open(output_csv_path, "w")
+     output_json.write("[\n")
+     output_json.flush()
+     # Conclude the JSON file format with a closing bracket. Using `atexit` will
+     # handle all cases of the program exiting, including Ctrl-C and errors.
+     atexit.register(lambda: output_json.write("\n]\n"))
+
+     # Dump the configuration to a JSON file.
+     with open(config_json_path, "w") as config_json:
+         json.dump(
+             {
+                 "model_path": model_path,
+                 "input_file": input_file,
+                 "device_index": device_index,
+                 "task": task,
+                 "load_8bit": load_8bit,
+                 "temperature": temperature,
+                 "repitition_penalty": repitition_penalty,
+                 "max_new_tokens": max_new_tokens,
+             },
+             config_json,
+             indent=4,
+         )
+         config_json.write("\n")
+
+     def dataloader(input_file: str) -> Generator[tuple[bool, str], None, None]:
+         """Yields a tuple of whether this is a warmup run and the input prompt."""
+         for _ in range(3):
+             yield True, "Say something long and random. I don't care about the content."
+         for item in json.load(open(input_file, "r")):
+             input_prompt = item["conversations"][0]["value"]
+             yield False, input_prompt
+
+     # Warm up the GPU with some random prompts.
+     # Forward through all the prompts.
+     is_first = True
+     for is_warmup, input_prompt in dataloader(input_file):
+         # Construct the input prompt.
+         conv = copy.deepcopy(conv_base)
+         conv.append_message(conv.roles[0], input_prompt)
+         conv.append_message(conv.roles[1], "")
+         prompt = conv.get_prompt()
+         gen_params["prompt"] = prompt
+
+         # Print input prompt.
+         rich.print(f"\n[u]{'Warmup ' if is_warmup else ''}Prompt[/u]:\n{prompt.strip()}\n")
+
+         # Generate the ouptut from the model.
+         output_stream = generate_stream(model, tokenizer, gen_params, device="cuda")
+         output = {}
+
+         #################################################
+         # Inference and measurement zone!
+         #################################################
+         monitor.begin_window("inference")
+         for output in output_stream:
+             pass
+         measurements = monitor.end_window("inference")
+         #################################################
+
+         # Record numbers.
+         output_text = output["text"]
+         if not is_warmup:
+             response_length = len(tokenizer.encode(output_text))  # number of tokens
+             latency = measurements.time
+             throughput = response_length / latency
+             energy = measurements.total_energy
+             output = {
+                 "model": model_name_cleaned,
+                 "throughput": throughput,
+                 "response_length": response_length,
+                 "latency": latency,
+                 "energy": energy,
+                 "input": prompt.strip(),
+                 "output": output_text.strip(),
+             }
+             output_str = json.dumps(output, indent=4)
+             if not is_warmup:
+                 if not is_first:
+                     output_json.write(",\n" + output_str)
+                 else:
+                     is_first = False
+                     output_json.write(output_str)
+                 output_json.flush()
+
+         # Print the response.
+         rich.print(f"\n[u]{'Warmup ' if is_warmup else ''}Response[/u]:\n{output_text.strip()}\n")
+
+         # Print measurement.
+         rich.print(measurements)
 
 
  if __name__ == "__main__":
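
Note on the output format: `benchmark.py` streams results into `benchmark.json` as they are produced, writing the opening `[` up front and registering the closing `]` with `atexit` so the file stays valid JSON even if the run is interrupted. A minimal standalone sketch of that pattern (the file name and records here are illustrative only, not part of the repo):

```python
import atexit
import json

out = open("results.json", "w")  # illustrative path
out.write("[\n")
# Close the JSON array however the program exits: normal return, Ctrl-C, or an error.
atexit.register(lambda: out.write("\n]\n"))

first = True
for record in ({"index": i} for i in range(3)):  # stand-in for real benchmark records
    out.write(("" if first else ",\n") + json.dumps(record, indent=4))
    out.flush()
    first = False
```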
data/chat/lmsys--vicuna-7B/benchmark.json ADDED
@@ -0,0 +1,38 @@
+ [
+ {
+     "model": "lmsys--vicuna-7B",
+     "throughput": 30.558443157709224,
+     "response_length": 331,
+     "latency": 10.831703640520573,
+     "energy": 2215.2789999842644,
+     "input": "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients... ASSISTANT:",
+     "output": "Sure, here are the main ideas of Jeff Walker's Product Launch Formula and how they can be applied to a growth marketing agency implementing these strategies and tactics for their clients:\n\n1. Pre-launch phase:\n* Build an audience through content marketing, social media, and other channels.\n* Create a sense of urgency and exclusivity by offering early access or special bonuses to those on a mailing list.\n* Use teasers, sneak peeks, and other forms of anticipation to build excitement.\n2. Launch phase:\n* Use a webinar or live event to launch the product, using stories, testimonials, and social proof to build trust and credibility.\n* Offer a limited-time special offer or bonus to those who purchase during the launch.\n* Continually communicate with the audience during the launch phase, answering questions, providing updates, and building relationships.\n3. Post-launch phase:\n* Follow up with customers to make sure they are satisfied and provide additional support and resources.\n* Use customer testimonials, success stories, and case studies to continue building social proof and credibility.\n* Continually update and improve the product based on customer feedback.\n\nOverall, the Product Launch Formula emphasizes the importance of building an engaged audience, creating urgency and exclusivity, and providing ongoing support and value to customers. By implementing these strategies and tactics, a growth marketing agency can help their clients launch successful products and grow their businesses."
+ },
+ {
+     "model": "lmsys--vicuna-7B",
+     "throughput": 31.514563598172494,
+     "response_length": 98,
+     "latency": 3.109673395752907,
+     "energy": 640.4210000038147,
+     "input": "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: How to tell if a customer segment is well segmented? In 3 bullet points. ASSISTANT:",
+     "output": "1. Homogeneity: The segment should consist of customers who share similar characteristics, behaviors, or needs.\n2. Distinctiveness: The segment should be different from other segments in terms of their characteristics, behaviors, or needs.\n3. Stability: The segment should remain relatively consistent over time and not change drastically. If the segment is constantly changing or becoming more similar to other segments, it may indicate that the segmentation is not effective."
+ },
+ {
+     "model": "lmsys--vicuna-7B",
+     "throughput": 30.362787676034074,
+     "response_length": 346,
+     "latency": 11.39552809484303,
+     "energy": 2334.6629999876022,
+     "input": "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: In Java, I want to replace string like \"This is a new {object} at {place}\" with a Map, {object: \"student\", \"point 3, 4\"}, and get a result \"This is a new student at point 3, 4\". How can I do? ASSISTANT:",
+     "output": "To replace the string with a map, you can use the `String.format()` method and pass in an object that contains the key-value pairs of the map. Here's an example of how you can do this in Java:\n```\nString originalString = \"This is a new {object} at {place}\";\nString formattedString = String.format(\"This is a new %s at %s\",\n new ObjectMapper().writeValueAsString(map),\n new ObjectMapper().writeValueAsString(place));\n```\nIn this example, `originalString` is the original string that you want to replace, `map` is the map that you want to use to replace the {object} and {place} placeholders, and `ObjectMapper` is a class that is used to convert the map to a string. The `writeValueAsString()` method is used to convert the map to a JSON string, which can then be passed to the `String.format()` method to replace the placeholders in the string.\n\nTo get the result \"This is a new student at point 3, 4\", you can use the following code:\n```\nString result = String.format(\"This is a new %s at %s\",\n new ObjectMapper().writeValueAsString(map),\n new ObjectMapper().writeValueAsString(place));\n```\nYou can then pass the `result` string to a `println` statement or use it in any other way that you like.\n\nI hope this helps! Let me know if you have any other questions."
+ },
+ {
+     "model": "lmsys--vicuna-7B",
+     "throughput": 31.208981333084907,
+     "response_length": 96,
+     "latency": 3.076037598773837,
+     "energy": 649.0310000181198,
+     "input": "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Metaphorical language is also used to describe the various addressing modes of the instructions. Grandiose language to express their excitement and admiration for the functionality of the instructions being described. Now, rewrite this with more perplexity:\n\nJMP ABCD\nMOV AX, [BX+SI]\nMOV AX, [100]\nMOV AX, [BX]\nMOV AX, [BX\\*2+SI]\nMOV AX, BX\nMOV AX, 7 ASSISTANT:",
+     "output": "The instructions described seem to possess a remarkable array of addressing modes, each with its own unique set of characteristics. The use of grandiose language to express excitement and admiration for these instructions is certainly understandable, as their functionality seems to transcend the mere execution of basic computations. With so many different addressing modes to choose from, one might be forgiven for feeling a sense of perplexity and wonder at the sheer complexity of these instructions."
+ }
+ ]
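
The old `inference.py` printed tokens per second, Joules per token, and average power directly; `benchmark.py` instead records the raw per-request numbers shown above. A minimal sketch of recovering those summary metrics from a results file (using the example file added in this commit):

```python
import json

# Per-request records written by benchmark.py.
with open("data/chat/lmsys--vicuna-7B/benchmark.json") as f:
    records = json.load(f)

total_tokens = sum(r["response_length"] for r in records)
total_time = sum(r["latency"] for r in records)    # seconds
total_energy = sum(r["energy"] for r in records)   # Joules

print(f"Tokens per second: {total_tokens / total_time:.2f}")
print(f"Joules per token: {total_energy / total_tokens:.2f}")
print(f"Average power (W): {total_energy / total_time:.2f}")
```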
data/chat/lmsys--vicuna-7B/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+     "model_path": "/data/leaderboard/weights/lmsys/vicuna-7B",
+     "input_file": "/data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json",
+     "device_index": 0,
+     "task": "chat",
+     "load_8bit": false,
+     "temperature": 0.7,
+     "repitition_penalty": 1.0,
+     "max_new_tokens": 512
+ }
sharegpt/README.md CHANGED
@@ -1,5 +1,6 @@
+ # How we used ShareGPT to create our benchmark dataset
 
- ## Download ShareGPT :
+ ## Download ShareGPT
  ```
  https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json
 
@@ -8,30 +9,21 @@ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolv
 
  ## Install Fastchat
  ```
- pip3 install fastchat
+ pip install fastchat
  ```
 
  ## Clean data:
  ```
- pip3 install polyglot pyicu pycld2
- python3 -m fastchat.data.optional_clean --in sg_90k_part1_html_cleaned.json --out sg_90k_part1_html_cleaned_lang.json --keep-lang en
+ pip install polyglot pyicu pycld2
+ python -m fastchat.data.optional_clean --in sg_90k_part1_html_cleaned.json --out sg_90k_part1_html_cleaned_lang.json --keep-lang en
  ```
 
- ## Extract first sentence (optional)
+ ## Extract first prompt
  ```
  python extract_first.py --in-file sg_90k_part1_html_cleaned_lang.json --out-file sg_90k_part1_html_cleaned_lang_first.json
  ```
 
- ## Sample data (optional)
+ ## Sample data
  ```
- python3 -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --out sg_90k_part1_html_cleaned_lang_first_sampled.json --end 10000 --max-length 10000
+ python -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --out sg_90k_part1_html_cleaned_lang_first_sampled.json --end 10000 --max-length 10000
  ```
-
- ## ShareGPT Feeder Usage
-
- ```
- from sharegpt_feeder import generator
- sharegpt_generator = generator()
- print(next(sharegpt_generator))
- print(next(sharegpt_generator))
- ```
sharegpt/sharegpt_feeder.py DELETED
@@ -1,15 +0,0 @@
- ''' Usage
- sharegpt_generator = sharegpt_generator()
- print(next(sharegpt_generator))
- print(next(sharegpt_generator))
- print(next(sharegpt_generator))
- '''
- import json
-
- def sharegpt_generator(file = 'sg_90k_part1_html_cleaned_lang.json'):
-     content = json.load(open(file, "r"))
-     for item in content:
-         yield item['conversations'][0]['value']
-
-
-
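
The deleted `sharegpt_feeder.py` generator is superseded by the `dataloader` defined inside `benchmark.py`, which reads the sampled ShareGPT file directly. A minimal sketch for spot-checking that file (field names taken from the `dataloader` above; the path is the file name produced by the README steps):

```python
import json

# Spot-check the sampled ShareGPT file that benchmark.py consumes.
with open("sg_90k_part1_html_cleaned_lang_first_sampled.json") as f:
    data = json.load(f)

print(f"{len(data)} conversations")
for item in data[:3]:
    # benchmark.py uses the first turn of each conversation as the input prompt.
    print(item["conversations"][0]["value"][:80])
```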