import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

from torch.profiler import ProfilerActivity, profile, record_function
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from torch import nn
import torch

torch.set_float32_matmul_precision('high')

import json
from argparse import ArgumentParser
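
# Helper: sample the next token id from the softmax distribution over the
# logits at the last position (one multinomial draw per sequence in the batch).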
def sample(outputs):
    next_token_logits = outputs.logits[:, -1, :]
    probs = nn.functional.softmax(next_token_logits, dim=-1)
    next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
    return next_tokens

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--device", default='cuda')
    parser.add_argument("--model", required=True)
    parser.add_argument("--use_cache", action='store_true')
    parser.add_argument("--max_new_tokens", type=int, default=16_000)
    parser.add_argument("--output_path", required=True)
    args = parser.parse_args()

    prompt = 'hello'

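    # The model is instantiated from its config alone, so the weights are randomly
    # initialised (no pretrained checkpoint is loaded) -- enough for timing.
    # Position embeddings are sized to cover the full generation length.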
    config = AutoConfig.from_pretrained(args.model)
    config.max_position_embeddings = args.max_new_tokens + 10
    model = AutoModelForCausalLM.from_config(config)
    model.eval()
    model = model.to(args.device)
    model = torch.compile(model)
    model_size = sum(p.numel() for p in model.parameters())

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    tokenized_prompt = tokenizer(prompt, return_tensors="pt")
    tokenized_prompt = tokenized_prompt['input_ids'].to(args.device)

    model_input = {
        "input_ids": tokenized_prompt,
        "use_cache": args.use_cache,
    }

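    # RWKV models expose their recurrent state as `state`; attention-based models
    # use `past_key_values`. The cache slot starts empty and is refilled each step.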
    cache_name = "state" if args.model.startswith("RWKV") else "past_key_values"
    model_input[cache_name] = None

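    # One JSON record per generated token is appended to the output file (JSONL),
    # each carrying the profiler's time and memory figures for that single step.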
    out_dir = os.path.dirname(args.output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    writer = open(args.output_path, 'w')
    for tok_idx in range(args.max_new_tokens):
        with torch.no_grad():
            # With a warm cache only the newest token is fed; otherwise the whole
            # sequence so far is re-encoded at every step.
            if args.use_cache and model_input[cache_name] is not None:
                model_input["input_ids"] = tokenized_prompt[:, -1:].to(args.device)
            else:
                model_input["input_ids"] = tokenized_prompt.to(args.device)
            with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=False) as prof:
                with record_function("model_inference"):
                    output = model.forward(**model_input)

        model_input[cache_name] = getattr(output, cache_name)
        next_tokens = sample(output)
        # The growing token sequence is kept on the CPU and moved back to the
        # device only when it is fed to the model above.
        tokenized_prompt = torch.cat([tokenized_prompt.cpu(), next_tokens[:, None].cpu()], dim=-1)

        # Pull the aggregated profiler entry for the "model_inference" region;
        # the reported times are in microseconds.
        full_profile = next(event for event in prof.key_averages() if event.key == 'model_inference')
        writer.write(json.dumps({
            "model_name": args.model,
            "model_size": model_size,
            "token_id": tok_idx,
            "strategy": args.device,
            "cpu_time": full_profile.cpu_time,
            "cuda_time": full_profile.cuda_time,
            "cpu_memory_usage": full_profile.cpu_memory_usage,
            "cuda_memory_usage": full_profile.cuda_memory_usage,
            "self_cpu_memory_usage": full_profile.self_cpu_memory_usage,
            "self_cuda_memory_usage": full_profile.self_cuda_memory_usage,
            "max_memory_allocated": torch.cuda.max_memory_allocated(),
        }) + '\n')
        torch.cuda.empty_cache()

    writer.close()

""" |
|
python benchmark_inference_time.py --model RWKV/rwkv-4-3b-pile --use_cache --output_path data/inference_time/rwkv-3b.jsonl |
|
python benchmark_inference_time.py --model RWKV/rwkv-4-7b-pile --use_cache --output_path data/inference_time/rwkv-7b.jsonl |
|
python benchmark_inference_time.py --model RWKV/rwkv-4-14b-pile --use_cache --output_path data/inference_time/rwkv-14b.jsonl |
|
python benchmark_inference_time.py --model facebook/opt-2.7b --use_cache --output_path data/inference_time/opt-2.7b.jsonl |
|
python benchmark_inference_time.py --model facebook/opt-6.7b --use_cache --output_path data/inference_time/opt-6.7b.jsonl |
|
python benchmark_inference_time.py --model EleutherAI/pythia-2.8b --use_cache --output_path data/inference_time/pythia-2.8b.jsonl |
|
python benchmark_inference_time.py --model EleutherAI/pythia-6.9b --use_cache --output_path data/inference_time/pythia-6.9b.jsonl |
|
python benchmark_inference_time.py --model EleutherAI/gpt-neo-2.7B --use_cache --output_path data/inference_time/gpt-neo-2.7B.jsonl |
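
# To benchmark a transformer without its KV cache, drop --use_cache; the full
# prefix is then re-encoded at every step (hypothetical output path shown):
python benchmark_inference_time.py --model facebook/opt-2.7b --output_path data/inference_time/opt-2.7b-nocache.jsonl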

############# Plotting Code ##############
import numpy as np
import json
def get_jsonl(f): return [json.loads(x) for x in open(f).readlines()]
import matplotlib.pyplot as plt

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 4))
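
# One curve per model: per-token latency, cumulative latency, and peak GPU memory.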
for model_name in [
    "rwkv-3b",
    # "rwkv-7b",
    # "rwkv-14b",
    "opt-2.7b",
    "gpt-neo-2.7B",
    "pythia-2.8b",
]:
    data = get_jsonl(f"data/inference_time/{model_name}.jsonl")
    cuda_time = [x['cuda_time'] for x in data]                       # microseconds per token
    cumulative_time = np.cumsum(cuda_time) / (1000 * 1000)           # seconds
    memory_usage = [x['max_memory_allocated'] / (2**30) for x in data]  # GiB
    ax1.plot([x / 1000 for x in cuda_time][100:], label=model_name)  # ms; the first 100 tokens are skipped
    ax2.plot(cumulative_time, label=model_name)
    ax3.plot(memory_usage, label=model_name)


ax1.set_xlabel("# Tokens")
ax1.set_ylabel("Time (ms) to generate the #-th token")
ax1.grid()
ax1.legend()
ax1.set_title("Single Token Generation Latency")

ax2.set_xlabel("# Tokens")
ax2.set_ylabel("Cumulative time (s) to generate the #-th token")
ax2.grid()
ax2.legend()
ax2.set_title("Cumulative Generation Latency")

ax3.set_xlabel("# Tokens")
ax3.set_ylabel("Memory usage (GiB)")
ax3.grid()
ax3.legend()
ax3.set_title("Memory Usage during Generation")
"""