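"""Deployment tests for swift.llm.

Each test starts an OpenAI-compatible inference server (via `run_deploy`
with the vllm/lmdeploy/pt backends, or a vanilla vLLM api_server) and
queries it with `InferClient`.
"""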
def _test_client(port=8000):
    import time

    from swift.llm import InferClient, InferRequest, RequestConfig, load_dataset

    dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], num_proc=4)
    infer_client = InferClient(port=port)
    # Poll until the deployed server is up and responding.
    while True:
        try:
            infer_client.models
            break
        except Exception:
            time.sleep(1)

    # dataset[0] is the train split; each sample maps directly onto InferRequest fields.
    infer_requests = [InferRequest(**data) for data in dataset[0]]
    request_config = RequestConfig(seed=42, max_tokens=256, temperature=0.8)

    resp = infer_client.infer(infer_requests, request_config=request_config, use_tqdm=False)
    print(len(resp))


def _test(infer_backend):
    import os

    # Pin the test to a single GPU.
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    from swift.llm import DeployArguments, run_deploy

    args = DeployArguments(model='Qwen/Qwen2-7B-Instruct', infer_backend=infer_backend, verbose=False)
    # run_deploy launches the server and yields the port it listens on.
    with run_deploy(args) as port:
        _test_client(port)


def test_vllm():
    _test('vllm')


def test_lmdeploy():
    _test('lmdeploy')


def test_pt():
    _test('pt')


def test_vllm_origin():
    import subprocess
    import sys

    from modelscope import snapshot_download

    model_dir = snapshot_download('Qwen/Qwen2-7B-Instruct')
    # Launch a vanilla vLLM OpenAI-compatible server against the downloaded weights.
    args = [sys.executable, '-m', 'vllm.entrypoints.openai.api_server', '--model', model_dir]
    process = subprocess.Popen(args)
    try:
        _test_client()
    finally:
        # Shut the server down even if the client check fails.
        process.terminate()


if __name__ == '__main__':
    test_lmdeploy()