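# Agent (tool-calling) inference demo for ms-swift: the model is asked a weather
# question, generation stops at the agent template's observation keyword, a mock
# tool result is appended, and the model then completes the final answer.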
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'


def infer(engine: 'InferEngine', infer_request: 'InferRequest'):
    # Stop generation at the agent template's observation keyword so the model
    # pauses after emitting a tool call and waits for the tool result.
    stop = [engine.default_template.agent_template.keyword.observation]
    request_config = RequestConfig(max_tokens=512, temperature=0, stop=stop)
    resp_list = engine.infer([infer_request], request_config)
    query = infer_request.messages[0]['content']
    response = resp_list[0].choices[0].message.content
    print(f'query: {query}')
    print(f'response: {response}')
    print(f'tool_calls: {resp_list[0].choices[0].message.tool_calls}')

    # Append the assistant turn and a mock tool result, then generate the final answer.
    tool = '{"temperature": 32, "condition": "Sunny", "humidity": 50}'
    print(f'tool_response: {tool}')
    infer_request.messages += [{'role': 'assistant', 'content': response}, {'role': 'tool', 'content': tool}]
    resp_list = engine.infer([infer_request], request_config)
    response2 = resp_list[0].choices[0].message.content
    print(f'response2: {response2}')

|
def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
    stop = [engine.default_template.agent_template.keyword.observation]
    request_config = RequestConfig(max_tokens=512, temperature=0, stream=True, stop=stop)
    gen_list = engine.infer([infer_request], request_config)
    query = infer_request.messages[0]['content']
    response = ''
    print(f'query: {query}\nresponse: ', end='')
    for resp in gen_list[0]:
        if resp is None:
            continue
        delta = resp.choices[0].delta.content
        response += delta
        print(delta, end='', flush=True)
    print()
    # `resp` is the last chunk of the stream; its delta carries the parsed tool call.
    print(f'tool_calls: {resp.choices[0].delta.tool_calls}')

    # Append the assistant turn and a mock tool result, then stream the final answer.
    tool = '{"temperature": 32, "condition": "Sunny", "humidity": 50}'
    print(f'tool_response: {tool}\nresponse2: ', end='')
    infer_request.messages += [{'role': 'assistant', 'content': response}, {'role': 'tool', 'content': tool}]
    gen_list = engine.infer([infer_request], request_config)
    for resp in gen_list[0]:
        if resp is None:
            continue
        print(resp.choices[0].delta.content, end='', flush=True)
    print()

|
def get_infer_request():
    # A single-turn weather query plus the schema of the `get_current_weather` tool.
    return InferRequest(
        messages=[{
            'role': 'user',
            'content': "How's the weather in Beijing today?"
        }],
        tools=[{
            'name': 'get_current_weather',
            'description': 'Get the current weather in a given location',
            'parameters': {
                'type': 'object',
                'properties': {
                    'location': {
                        'type': 'string',
                        'description': 'The city and state, e.g. San Francisco, CA'
                    },
                    'unit': {
                        'type': 'string',
                        'enum': ['celsius', 'fahrenheit']
                    }
                },
                'required': ['location']
            }
        }])
|
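# Note: `get_infer_request` passes the flat tool schema used by the original demo.
# The OpenAI chat-completions style nests the same fields one level deeper; whether
# swift's parser also accepts that form is an assumption, so it is left commented out:
# tools=[{'type': 'function', 'function': {'name': 'get_current_weather', ...}}]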
|
def infer_continue_generate(engine):
    # The trailing assistant message with `content: None` marks a continue-generation
    # request: the engine resumes from the partial assistant reply instead of
    # starting a fresh turn.
    infer_request = InferRequest(messages=[{
        'role': 'user',
        'content': 'How is the weather today?'
    }, {
        'role': 'assistant',
        'content': 'It is sunny today, '
    }, {
        'role': 'assistant',
        'content': None
    }])
    request_config = RequestConfig(max_tokens=512, temperature=0)
    resp_list = engine.infer([infer_request], request_config)
    response = resp_list[0].choices[0].message.content
    print(f'response: {response}')

|
if __name__ == '__main__':
    from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig
    from swift.plugin import agent_templates

    model = 'Qwen/Qwen2.5-1.5B-Instruct'
    infer_backend = 'pt'

    if infer_backend == 'pt':
        engine = PtEngine(model, max_batch_size=64)
    elif infer_backend == 'vllm':
        from swift.llm import VllmEngine
        engine = VllmEngine(model, max_model_len=8192)
    elif infer_backend == 'lmdeploy':
        from swift.llm import LmdeployEngine
        engine = LmdeployEngine(model)
    else:
        # Fail fast instead of hitting a NameError on `engine` below.
        raise ValueError(f'Unknown infer_backend: {infer_backend}')
|
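    # `agent_templates` (imported above) maps template names to agent-template
    # classes; assigning one switches the tool-calling format. A sketch, left
    # commented out because the key name is an assumption; check
    # swift.plugin.agent_templates for the names actually registered:
    # engine.default_template.agent_template = agent_templates['react_en']()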

    infer(engine, get_infer_request())
    infer_stream(engine, get_infer_request())
    infer_continue_generate(engine)