"""Profile the CPU aten-operator calls made by Llama-3.3-70B-Instruct.

Downloads the model snapshot from ModelScope, runs a single chat
generation under the PyTorch profiler, and writes the per-operator
summary table (sorted by call count) to a text file.
"""
import transformers
import torch
from modelscope import snapshot_download

# Resolve (downloading on first use) the local path of the model weights.
model_id = snapshot_download("LLM-Research/Llama-3.3-70B-Instruct")

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        # torch.profiler.ProfilerActivity.CUDA,  # CPU alone suffices to capture aten function calls
    ],
    # record_shapes=True,
    # with_stack=True,
) as p:
    # Only the generation call is profiled; printing happens outside the context.
    outputs = pipeline(
        messages,
        max_new_tokens=256,
    )

print(outputs[0]["generated_text"][-1])

table_str = p.key_averages().table(
    sort_by="count",
    row_limit=-1,  # -1 = no row limit; dump every operator
    max_src_column_width=100,
    max_name_column_width=100,  # cap column widths so the table stays readable
)
# Explicit encoding so the table file is written the same way on every platform.
with open("Llama-3.3-70B-Instruct.txt", "wt", encoding="utf-8") as f:
    f.write(table_str)