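"""Quick check of the swift.llm inference engines (PtEngine / VllmEngine / LmdeployEngine).

Builds one plain-text request and one image request, then runs them through the
non-streaming `infer` call and the streaming call, printing InferStats metrics.
"""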
import os
from typing import Literal

import torch

# Run the test on a single GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'


def _prepare(infer_backend: Literal['vllm', 'pt', 'lmdeploy']):
    from swift.llm import InferRequest, get_template
    if infer_backend == 'lmdeploy':
        from swift.llm import LmdeployEngine
        engine = LmdeployEngine('Qwen/Qwen-VL-Chat', torch.float32)
    elif infer_backend == 'pt':
        from swift.llm import PtEngine
        engine = PtEngine('Qwen/Qwen2-VL-7B-Instruct')
    elif infer_backend == 'vllm':
        from swift.llm import VllmEngine
        engine = VllmEngine('Qwen/Qwen2-VL-7B-Instruct')
    template = get_template(engine.model_meta.template, engine.processor)
    infer_requests = [
        # Plain-text request: "What should I do if I can't sleep at night?"
        InferRequest([{
            'role': 'user',
            'content': '晚上睡不着觉怎么办'
        }]),
        # Multimodal request: the user message carries an image URL.
        InferRequest([{
            'role': 'user',
            'content': [{
                'type': 'image_url',
                'image_url': 'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png'
            }]
        }])
    ]
    return engine, template, infer_requests


def test_infer(engine, template, infer_requests):
    from swift.llm import RequestConfig
    from swift.plugin import InferStats
    request_config = RequestConfig(temperature=0)
    infer_stats = InferStats()
    response_list = engine.infer(
        infer_requests, template=template, request_config=request_config, metrics=[infer_stats])
    # Print the generated text of the first two responses, then the aggregated metrics.
    for response in response_list[:2]:
        print(response.choices[0].message.content)
    print(infer_stats.compute())


def test_stream(engine, template, infer_requests):
    from swift.llm import RequestConfig
    from swift.plugin import InferStats
    infer_stats = InferStats()
    request_config = RequestConfig(temperature=0, stream=True, logprobs=True)
    gen_list = engine.infer(infer_requests, template=template, request_config=request_config, metrics=[infer_stats])
    # Stream the first request and print the delta of each chunk as it arrives.
    for response in gen_list[0]:
        if response is None:
            continue
        print(response.choices[0].delta.content, end='', flush=True)
    print()
    print(infer_stats.compute())

    # Run again with a progress bar, consuming the stream without printing.
    gen_list = engine.infer(
        infer_requests, template=template, request_config=request_config, use_tqdm=True, metrics=[infer_stats])
    for response in gen_list[0]:
        pass
    print(infer_stats.compute())


if __name__ == '__main__':
    engine, template, infer_requests = _prepare(infer_backend='pt')
    test_infer(engine, template, infer_requests)
    test_stream(engine, template, infer_requests)
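
# To try the other backends, change the `infer_backend` argument above to 'vllm' or
# 'lmdeploy'; this assumes the corresponding optional dependency (vllm / lmdeploy)
# is installed in the environment.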