import argparse
import datetime
import json
import time

import torch
from vllm import LLM, SamplingParams

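# Example invocations (script name and paths are illustrative):
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-sft --mode single --prompt "Hello"
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-sft --mode chat --gpu_count 2
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-sft --mode batch --prompt_file prompts.txt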


def setup_model(model_path, tensor_parallel_size=None, dtype="bfloat16", gpu_memory_utilization=0.85):
    """
    Initialize the fine-tuned Qwen-2.5-7B model from a local path with explicit GPU configuration.

    Args:
        model_path: Path to the directory containing the trained model
        tensor_parallel_size: Number of GPUs to use for tensor parallelism (None means auto-detect)
        dtype: Data type for model weights (bfloat16, float16, or float32)
        gpu_memory_utilization: Fraction of GPU memory to use (0.0 to 1.0)
    """
    if tensor_parallel_size is None:
        # Auto-detect: shard across all visible GPUs, falling back to a single device.
        tensor_parallel_size = max(torch.cuda.device_count(), 1)

    print(f"Loading fine-tuned Qwen model from: {model_path}")
    print(f"GPU configuration: tensor_parallel_size={tensor_parallel_size}, dtype={dtype}, "
          f"gpu_memory_utilization={gpu_memory_utilization}")

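    # enforce_eager=False keeps vLLM's CUDA graph capture enabled, which reduces per-token
    # CPU overhead during decoding; set it to True to force eager mode if capture causes issues.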
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        tensor_parallel_size=tensor_parallel_size,
        dtype=dtype,
        gpu_memory_utilization=gpu_memory_utilization,
        enforce_eager=False,
    )

    print("Model loaded successfully!")
    return llm

def generate_response(llm, prompt, temperature=0.7, max_tokens=512, top_p=0.9):
    """Generate a response for a given prompt."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens
    )

    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text

def chat_completion(llm, messages, temperature=0.7, max_tokens=512):
    """Generate a chat completion from messages."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens
    )

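    # Prefer the tokenizer's built-in chat template (Qwen tokenizers ship one); the manual
    # ChatML fallback below is only used if apply_chat_template is unavailable.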
    tokenizer = llm.get_tokenizer()
    if hasattr(tokenizer, "apply_chat_template"):
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    else:
        prompt = format_messages_manually(messages)

    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text

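# Fallback prompt builder using Qwen's ChatML-style markers (<|im_start|> ... <|im_end|>);
# the trailing open assistant turn cues the model to generate its reply.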
def format_messages_manually(messages):
    """Format messages manually if a chat template is not available."""
    formatted_prompt = ""
    for message in messages:
        role = message["role"]
        content = message["content"]
        if role == "system":
            formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
        elif role == "user":
            formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
        elif role == "assistant":
            formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
    formatted_prompt += "<|im_start|>assistant\n"
    return formatted_prompt

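# vLLM batches all prompts passed to a single generate() call (continuous batching),
# so sending the whole list at once is typically much faster than looping per prompt.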
def batch_inference(llm, prompts, temperature=0.7, max_tokens=512):
    """Run batch inference on multiple prompts."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens
    )

    outputs = llm.generate(prompts, sampling_params)
    return [output.outputs[0].text for output in outputs]

def save_to_json(data, output_path=None):
    """Save results to a JSON file."""
    if not output_path:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = f"qwen_inference_results_{timestamp}.json"

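    # ensure_ascii=False keeps non-ASCII text (e.g. Chinese output) human-readable in the JSON file.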
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Results saved to: {output_path}")
    return output_path

def main():
    parser = argparse.ArgumentParser(description="GPU inference with a fine-tuned Qwen model, writing results to JSON")
    parser.add_argument("--model_path", required=True, help="Path to the fine-tuned model directory")
    parser.add_argument("--mode", choices=["single", "chat", "batch"], default="single", help="Inference mode")
    parser.add_argument("--prompt", help="Prompt for single inference mode")
    parser.add_argument("--prompt_file", help="File containing prompts for batch mode (one per line)")
    parser.add_argument("--output_file", help="Path to save JSON results (default: auto-generated)")
    parser.add_argument("--max_tokens", type=int, default=512, help="Maximum tokens in response")
    parser.add_argument("--temperature", type=float, default=0.7, help="Temperature for sampling")
    parser.add_argument("--gpu_count", type=int, help="Number of GPUs to use (default: all available)")
    parser.add_argument("--dtype", choices=["float16", "bfloat16", "float32"], default="bfloat16", help="Data type for model weights")
    parser.add_argument("--gpu_memory_utilization", type=float, default=0.85, help="GPU memory utilization (0.0-1.0)")
    args = parser.parse_args()

    llm = setup_model(
        model_path=args.model_path,
        tensor_parallel_size=args.gpu_count,
        dtype=args.dtype,
        gpu_memory_utilization=args.gpu_memory_utilization
    )

    results = {}

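    # Each mode populates `results`; the dict is written once at the end via save_to_json().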
    if args.mode == "single":
        if not args.prompt:
            args.prompt = input("Enter your prompt: ")

        print("\nGenerating response...")
        start_time = time.time()
        response = generate_response(
            llm,
            args.prompt,
            temperature=args.temperature,
            max_tokens=args.max_tokens
        )
        end_time = time.time()

        print(f"\nResponse:\n{response}")

        results = {
            "mode": "single",
            "timestamp": datetime.datetime.now().isoformat(),
            "input": args.prompt,
            "output": response,
            "parameters": {
                "temperature": args.temperature,
                "max_tokens": args.max_tokens
            },
            "performance": {
                "time_seconds": end_time - start_time
            }
        }

    elif args.mode == "chat":
        messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
        results = {
            "mode": "chat",
            "timestamp": datetime.datetime.now().isoformat(),
            "conversation": []
        }

        print("\nChat mode. Type 'exit' or 'quit' to end the conversation and save to JSON.\n")

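        # The full message history is resent on every turn, so the prompt grows with the conversation.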
        while True:
            user_input = input("\nYou: ")
            if user_input.lower() in ["exit", "quit"]:
                print("Ending conversation and saving results...")
                break

            messages.append({"role": "user", "content": user_input})

            start_time = time.time()
            response = chat_completion(
                llm,
                messages,
                temperature=args.temperature,
                max_tokens=args.max_tokens
            )
            end_time = time.time()

            print(f"\nAssistant: {response}")
            messages.append({"role": "assistant", "content": response})

            results["conversation"].append({
                "user": user_input,
                "assistant": response,
                "time_seconds": end_time - start_time
            })

    elif args.mode == "batch":
        if not args.prompt_file:
            print("Error: --prompt_file required for batch mode")
            return
        # --prompt_file is documented as one prompt per line, so read plain lines rather than JSON.
        with open(args.prompt_file, 'r', encoding='utf-8') as f:
            prompts = [line.strip() for line in f if line.strip()]

        print(f"Running batch inference on {len(prompts)} prompts...")
        inference_results = batch_inference(
            llm,
            prompts,
            temperature=args.temperature,
            max_tokens=args.max_tokens
        )

        results = {
            "mode": "batch",
            "timestamp": datetime.datetime.now().isoformat(),
            "results": [{"prompt": p, "response": r} for p, r in zip(prompts, inference_results)]
        }

    # Write the collected results; if --output_file is omitted, save_to_json generates a timestamped filename.
    save_to_json(results, args.output_file)


if __name__ == "__main__":
    main()