Zhiyu Wu committed on
Commit
f0128b6
1 Parent(s): 327a44b

Add llama2, sort ShareGPT dataset by length (#18)

README.md CHANGED
@@ -52,6 +52,6 @@ We run benchmarks using multiple nodes and GPUs using [Pegasus](https://github.c
 You can still run benchmarks without Pegasus like this:
 
 ```console
-$ docker exec leaderboard0 python scripts/benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-13B --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
-$ docker exec leaderboard0 python scripts/benchmark.py --model-path databricks/dolly-v2-12b --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
+$ docker exec leaderboard0 python scripts/benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-13B --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json
+$ docker exec leaderboard0 python scripts/benchmark.py --model-path databricks/dolly-v2-12b --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json
 ```
data/A40_chat-concise_benchmark.csv CHANGED
@@ -19,3 +19,5 @@ metaai/llama-7B,25.80475014752762,63.463734049697784,2.2525196486312047,539.0479
 Neutralzz/BiLLa-7B-SFT,29.382300021941255,141.6155137676293,4.84122748247456,1131.9990564138398
 openaccess-ai-collective/manticore-13b-chat-pyg,17.220798012743607,268.91269308260576,15.692034786355059,4051.8244570182064
 FreedomIntelligence/phoenix-inst-chat-7b,32.33242374435414,229.95869711215582,6.910495058340042,2049.7076356614534
+metaai/Llama-2-13b-chat-hf,16.934647828854768,358.7941571524513,20.990738735323337,3942.400414707617
+metaai/Llama-2-7b-chat-hf,31.733044836542074,402.6699126930826,12.569092892522697,2398.9215396235386
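For readers poking at these result files: a minimal loading sketch. The column names below are an assumption inferred from what scripts/benchmark.py records (throughput, response length, latency, energy); the CSVs themselves are not confirmed to carry a header row, so verify against the repo before relying on this.

```python
# Sketch: load one benchmark CSV. The column names are an ASSUMPTION
# inferred from the quantities scripts/benchmark.py records; they are
# not headers taken from the file itself.
import pandas as pd

cols = ["model", "throughput", "response_length", "latency", "energy"]
df = pd.read_csv("data/A40_chat-concise_benchmark.csv", names=cols)
print(df.sort_values("energy", ascending=False).head())
```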
data/A40_chat_benchmark.csv CHANGED
@@ -19,3 +19,5 @@ BAIR/koala-7b,29.723806931945834,260.7196104768301,8.720630589929986,2017.329562
 BAIR/koala-13b,17.451436035057224,262.5295500335796,15.030911340299886,3827.6102800537265
 StabilityAI/stablelm-tuned-alpha-7b,26.413142361637988,255.34687709872398,9.454673889303727,2319.91146675621
 togethercomputer/RedPajama-INCITE-7B-Chat,21.410571862447824,279.5094022834117,12.506414288534286,2541.441298522497
+metaai/Llama-2-13b-chat-hf,16.95804416983929,384.7333781061115,22.55271715111622,4337.670243116255
+metaai/Llama-2-7b-chat-hf,31.922994116700572,428.19341840161184,13.367807321468502,2556.7166067830576
data/A40_instruct-concise_benchmark.csv CHANGED
@@ -19,3 +19,5 @@ Neutralzz/BiLLa-7B-SFT,29.118626503392385,104.97817327065144,3.5443721553023035,
 nomic-ai/gpt4all-13b-snoozy,17.423064750595767,135.3938885157824,7.734149922101941,1871.6546057756862
 project-baize/baize-v2-7B,28.13796712305154,262.9902619207522,9.250474432119292,2105.324460711873
 lmsys/fastchat-t5-3b-v1.0,40.20822673632634,281.74110141034254,10.492163513616964,1110.3276249158694
+metaai/Llama-2-13b-chat-hf,16.753336372767794,223.39019476158495,12.93183804940574,2423.302869711249
+metaai/Llama-2-7b-chat-hf,30.95799874634315,220.83680322364003,6.815573463441101,1288.2125369376631
data/A40_instruct_benchmark.csv CHANGED
@@ -19,3 +19,5 @@ lmsys/fastchat-t5-3b-v1.0,31.014371537480102,357.13734049697786,17.9643423938542
 nomic-ai/gpt4all-13b-snoozy,17.558360268154225,232.67461383478846,13.290953806575821,3411.2449123573792
 BAIR/koala-13b,17.468010116614902,254.08529214237743,14.4913390549458,3858.416870718604
 metaai/llama-7B,26.40244189851013,104.19308260577569,3.608983782098236,864.4181752854275
+metaai/Llama-2-13b-chat-hf,16.999960399598052,371.56312961719277,21.688517364074986,4210.194823371436
+metaai/Llama-2-7b-chat-hf,31.815139493955602,365.40362659503023,11.316028104293823,2180.2478049026786
data/score.csv CHANGED
@@ -18,3 +18,5 @@ FreedomIntelligence/phoenix-inst-chat-7b,44.965870307167236,63.2244572794264,47.
 camel-ai/CAMEL-13B-Combined-Data,55.54607508532423,79.29695279824736,47.33219922854091
 Neutralzz/BiLLa-7B-SFT,27.730375426621162,26.04062935670185,49.045640164325754
 togethercomputer/RedPajama-INCITE-7B-Chat,42.15017064846416,70.8424616610237,36.10055989611241
+metaai/Llama-2-7b-chat-hf,52.73037542662116,78.48038239394542,45.32519554457334
+metaai/Llama-2-13b-chat-hf,59.129692832764505,81.94582752439753,43.9572591900371
pegasus/benchmark.yaml CHANGED
@@ -3,7 +3,7 @@
 # {{ gpu }} is defined in `hosts.yaml`, and will be filled in when Pegasus
 # determines the specific node and gpu the generated job command will run on.
 - command:
-  - docker exec leaderboard{{ gpu }} python scripts/benchmark.py --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --model-path {{ model }} --task {{ task }}
+  - docker exec leaderboard{{ gpu }} python scripts/benchmark.py --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json --model-path {{ model }} --task {{ task }}
   model:
   - /data/leaderboard/weights/metaai/llama-7B
   - /data/leaderboard/weights/metaai/llama-13B
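For context, each {{ placeholder }} in the command is filled from the lists beneath it, with {{ gpu }} coming from hosts.yaml at dispatch time. A rough sketch of that expansion, illustrative only and not Pegasus's actual code; the task list is taken from the data/*_benchmark.csv file names above.

```python
# Illustrative sketch of how a Pegasus-style job matrix expands into
# concrete commands. NOT Pegasus's implementation; a fixed gpu=0 stands
# in for the {{ gpu }} value that hosts.yaml supplies at dispatch time.
from itertools import product

command = (
    "docker exec leaderboard{gpu} python scripts/benchmark.py "
    "--input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json "
    "--model-path {model} --task {task}"
)
models = [
    "/data/leaderboard/weights/metaai/llama-7B",
    "/data/leaderboard/weights/metaai/llama-13B",
]
tasks = ["chat", "chat-concise", "instruct", "instruct-concise"]  # from data/*.csv names

for model, task in product(models, tasks):
    print(command.format(gpu=0, model=model, task=task))
```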
requirements-benchmark.txt CHANGED
@@ -1,5 +1,5 @@
 zeus-ml==0.4.0
-fschat==0.2.14
+fschat==0.2.20
 rwkv==0.7.5
 einops
 tyro
scripts/benchmark.py CHANGED
@@ -197,7 +197,7 @@ def generate_stream(
         if not any(partially_stopped):
             # indicates which request in batch stopped
             different_indices = np.where(stopped != old_stopped)[0]
-            stop_length = np.array([(i, len(output[i])) for i in different_indices])
+            stop_length = np.array([(j, i+1) for j in different_indices])
             yield {
                 "text": output,
                 "stop_length": stop_length,
@@ -215,7 +215,7 @@
         spaces_between_special_tokens=False,
         clean_up_tokenization_spaces=True,
     )
-    stop_length = np.array([(i, len(output[i])) for i in false_indices])
+    stop_length = np.array([(i, max_new_tokens) for i in false_indices])
 
     yield {
         "text": output,
@@ -230,7 +230,7 @@
 
 def main(
     model_path: str,
-    input_file: str = "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json",
+    input_file: str = "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json",
     output_dir: str = "data",
     device_index: int = 0,
     task: Literal[tuple(SYSTEM_PROMPTS)] = "chat",  # type: ignore
@@ -245,7 +245,7 @@
     Args:
         model_path: Path to or Huggingface Hub Id of the model.
        input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
-            (Default: "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json")
+            (Default: "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json")
         output_dir: Path to the output directory. (Default: "data")
         device_index: Index of the GPU to use for inference. (Default: 0)
         task: Type of task to perform inference on. (Default: "chat")
@@ -304,7 +304,12 @@
     conv_base = get_conversation_template(model_path)
 
     # Standardize the system prompt for every model.
-    conv_base.system = SYSTEM_PROMPTS[task]
+    if "llama-2" in model_path.lower():
+        conv_base.system = f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPTS[task]}\n<</SYS>>\n\n"
+    elif "stablelm" in model_path.lower():
+        conv_base.system = f"""<|SYSTEM|># {SYSTEM_PROMPTS[task]}\n"""
+    else:
+        conv_base.system = SYSTEM_PROMPTS[task]
     conv_base.messages = []
     conv_base.offset = 0
 
@@ -407,7 +412,8 @@
         # Record numbers.
         output_text = output["text"]
         if not is_warmup:
-            response_length = int(sum(batch_token_len.values()))  # number of valid tokens
+            total_length = int(sum(batch_token_len.values()))  # number of valid tokens
+            response_length = float(total_length) / len(convs)
             latency = measurements.time
             throughput = response_length / latency
             energy = measurements.total_energy
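Two behavioral notes on the benchmark.py changes above. First, stop_length previously stored len(output[i]), the character length of the decoded text; the new code stores the decoding step (what appears to be the loop index i+1, or max_new_tokens when a request never stopped), so stop lengths are now counted in tokens. Second, the system prompt is now wrapped in each model family's native template. A standalone demonstration of that branch follows; SYSTEM_PROMPTS here is an illustrative stand-in (the real table lives in scripts/benchmark.py), while the templates themselves are copied from the diff.

```python
# Demonstration of the new system-prompt branch, runnable standalone.
# SYSTEM_PROMPTS is an illustrative stand-in; the templates are copied
# verbatim from the scripts/benchmark.py diff above.
SYSTEM_PROMPTS = {"chat": "You are a helpful assistant."}  # stand-in value
task = "chat"

for model_path in [
    "metaai/Llama-2-7b-chat-hf",
    "StabilityAI/stablelm-tuned-alpha-7b",
    "lmsys/vicuna-13B",
]:
    if "llama-2" in model_path.lower():
        system = f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPTS[task]}\n<</SYS>>\n\n"
    elif "stablelm" in model_path.lower():
        system = f"<|SYSTEM|># {SYSTEM_PROMPTS[task]}\n"
    else:
        system = SYSTEM_PROMPTS[task]
    print(model_path, "->", repr(system))
```

Llama-2 chat models expect the system prompt inside <<SYS>> markers within the first [INST] block, and StableLM-tuned uses its <|SYSTEM|> special token, which is what the two templates encode.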
sharegpt/README.md CHANGED
@@ -25,3 +25,8 @@ python extract_first.py --in-file sg_90k_part1_html_cleaned_lang.json --out-file
 ```
 python -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --out sg_90k_part1_html_cleaned_lang_first_sampled.json --end 10000 --max-length 10000
 ```
+
+## Sorted data
+```
+python sort.py --data-dir sg_90k_part1_html_cleaned_lang_first_sampled.json --out-file sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json
+```
sharegpt/{sg_90k_part1_html_cleaned_lang_first_sampled.json → sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json} RENAMED
The diff for this file is too large to render. See raw diff