Upload 3 files

- Inference.py +220 -0
- inference_1.py +139 -0
- test_sft_nothink_10.json +12 -0
Inference.py
ADDED
@@ -0,0 +1,220 @@
from vllm import LLM, SamplingParams
import argparse
import json
import time
import datetime

def setup_model(model_path, tensor_parallel_size=None, dtype="bfloat16", gpu_memory_utilization=0.85):
    """
    Initialize the fine-tuned Qwen-2.5-7B model from a local path with explicit GPU configuration.

    Args:
        model_path: Path to the directory containing the trained model
        tensor_parallel_size: Number of GPUs to use for tensor parallelism (None means 1 GPU)
        dtype: Data type for model weights (bfloat16, float16, or float32)
        gpu_memory_utilization: Fraction of GPU memory to use (0.0 to 1.0)
    """
    print(f"Loading fine-tuned Qwen model from: {model_path}")
    print(f"GPU configuration: tensor_parallel_size={tensor_parallel_size}, dtype={dtype}, "
          f"gpu_memory_utilization={gpu_memory_utilization}")

    # Initialize the model with vLLM using the GPU settings.
    # vLLM expects an integer GPU count, so fall back to 1 when none is given.
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        tensor_parallel_size=tensor_parallel_size or 1,  # Number of GPUs to use
        dtype=dtype,                                     # Data type for model weights
        gpu_memory_utilization=gpu_memory_utilization,   # Memory usage per GPU
        enforce_eager=False,                             # Set to True if you encounter CUDA graph issues
        # max_model_len=8192,                            # Uncomment if you need longer context
    )

    print("Model loaded successfully!")
    return llm

def generate_response(llm, prompt, temperature=0.7, max_tokens=512, top_p=0.9):
    """Generate a response for a given prompt."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens
    )

    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text

def chat_completion(llm, messages, temperature=0.7, max_tokens=512):
    """Generate a chat completion from a list of messages."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens
    )

    # Convert messages to a prompt using the model's chat template
    tokenizer = llm.get_tokenizer()
    if hasattr(tokenizer, "apply_chat_template"):
        # For newer transformers versions
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    else:
        # Fallback for tokenizers without a chat template
        prompt = format_messages_manually(messages)

    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text

def format_messages_manually(messages):
    """Format messages in ChatML manually if no chat template is available."""
    formatted_prompt = ""
    for message in messages:
        role = message["role"]
        content = message["content"]
        if role == "system":
            formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
        elif role == "user":
            formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
        elif role == "assistant":
            formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
    formatted_prompt += "<|im_start|>assistant\n"
    return formatted_prompt

def batch_inference(llm, prompts, temperature=0.7, max_tokens=512):
    """Run batch inference on multiple prompts."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens
    )

    outputs = llm.generate(prompts, sampling_params)
    return [output.outputs[0].text for output in outputs]

def save_to_json(data, output_path=None):
    """Save results to a JSON file."""
    if not output_path:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = f"qwen_inference_results_{timestamp}.json"

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Results saved to: {output_path}")
    return output_path

def main():
    parser = argparse.ArgumentParser(description="GPU inference with a fine-tuned Qwen model, with JSON output")
    parser.add_argument("--model_path", required=True, help="Path to the fine-tuned model directory")
    parser.add_argument("--mode", choices=["single", "chat", "batch"], default="single", help="Inference mode")
    parser.add_argument("--prompt", help="Prompt for single inference mode")
    parser.add_argument("--prompt_file", help="JSON file containing a list of prompts for batch mode")
    parser.add_argument("--output_file", help="Path to save JSON results (default: auto-generated)")
    parser.add_argument("--max_tokens", type=int, default=512, help="Maximum tokens in response")
    parser.add_argument("--temperature", type=float, default=0.7, help="Temperature for sampling")
    parser.add_argument("--gpu_count", type=int, help="Number of GPUs to use (default: 1)")
    parser.add_argument("--dtype", choices=["float16", "bfloat16", "float32"], default="bfloat16", help="Data type for weights")
    parser.add_argument("--gpu_memory_utilization", type=float, default=0.85, help="GPU memory utilization (0.0-1.0)")
    args = parser.parse_args()

    # Initialize the model with the specified GPU settings
    llm = setup_model(
        model_path=args.model_path,
        tensor_parallel_size=args.gpu_count,
        dtype=args.dtype,
        gpu_memory_utilization=args.gpu_memory_utilization
    )

    results = {}

    if args.mode == "single":
        if not args.prompt:
            args.prompt = input("Enter your prompt: ")

        print("\nGenerating response...")
        start_time = time.time()
        response = generate_response(
            llm,
            args.prompt,
            temperature=args.temperature,
            max_tokens=args.max_tokens
        )
        end_time = time.time()

        print(f"\nResponse:\n{response}")

        results = {
            "mode": "single",
            "timestamp": datetime.datetime.now().isoformat(),
            "input": args.prompt,
            "output": response,
            "parameters": {
                "temperature": args.temperature,
                "max_tokens": args.max_tokens
            },
            "performance": {
                "time_seconds": end_time - start_time
            }
        }

    elif args.mode == "chat":
        # For chat mode, save the entire conversation history
        messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
        results = {
            "mode": "chat",
            "timestamp": datetime.datetime.now().isoformat(),
            "conversation": []
        }

        print("\nChat mode. Type 'exit' or 'quit' to end the conversation and save to JSON.\n")

        while True:
            user_input = input("\nYou: ")
            if user_input.lower() in ["exit", "quit"]:
                print("Ending conversation and saving results...")
                break

            messages.append({"role": "user", "content": user_input})

            start_time = time.time()
            response = chat_completion(
                llm,
                messages,
                temperature=args.temperature,
                max_tokens=args.max_tokens
            )
            end_time = time.time()

            print(f"\nAssistant: {response}")
            messages.append({"role": "assistant", "content": response})

            # Record this exchange in the results
            results["conversation"].append({
                "user": user_input,
                "assistant": response,
                "time_seconds": end_time - start_time
            })

    elif args.mode == "batch":
        if not args.prompt_file:
            print("Error: --prompt_file required for batch mode")
            return
        with open(args.prompt_file, 'r', encoding='utf-8') as f:
            prompts = json.load(f)

        print(f"Running batch inference on {len(prompts)} prompts...")
        inference_results = batch_inference(
            llm,
            prompts,
            temperature=args.temperature,
            max_tokens=args.max_tokens
        )

        results = {
            "mode": "batch",
            "timestamp": datetime.datetime.now().isoformat(),
            "inputs": prompts,
            "outputs": inference_results,
            "parameters": {
                "temperature": args.temperature,
                "max_tokens": args.max_tokens
            }
        }

    # Save whichever mode ran; save_to_json falls back to an
    # auto-generated filename when --output_file is not given.
    save_to_json(results, args.output_file)

if __name__ == "__main__":
    main()
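For context, a minimal usage sketch of the batch path, not part of the commit: it assumes Inference.py is importable as a module on the same machine, and the model path below is a placeholder.

# Hypothetical driver; the model path and sampling values are placeholders.
import json
from Inference import setup_model, batch_inference, save_to_json

llm = setup_model("/path/to/finetuned-qwen2.5-7b", tensor_parallel_size=1)

# The prompt file ships with this commit: a JSON list of pre-templated prompts.
with open("test_sft_nothink_10.json", "r", encoding="utf-8") as f:
    prompts = json.load(f)

# A low temperature suits these multiple-choice prompts; top_p stays at the
# script's built-in 0.9.
outputs = batch_inference(llm, prompts, temperature=0.1, max_tokens=64)
save_to_json({"mode": "batch", "outputs": outputs})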
inference_1.py
ADDED
@@ -0,0 +1,139 @@
from vllm import LLM, SamplingParams
import argparse
import json

def setup_model(model_path):
    """
    Initialize the fine-tuned Qwen-2.5-7B model from a local path.

    Args:
        model_path: Path to the directory containing the trained model
    """
    print(f"Loading fine-tuned Qwen model from: {model_path}")

    # Initialize the model with vLLM using the local path.
    # trust_remote_code=True is required for custom Qwen model code.
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        # Optional parameters for performance tuning
        # tensor_parallel_size=2,      # Use multiple GPUs
        # dtype="bfloat16",            # Use bfloat16 for more efficient inference
        # gpu_memory_utilization=0.85  # Control memory usage
    )

    print("Model loaded successfully!")
    return llm

def generate_response(llm, prompt, temperature=0.7, max_tokens=512, top_p=0.9):
    """Generate a response for a given prompt."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens
    )

    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text

def chat_completion(llm, messages, temperature=0.7, max_tokens=512):
    """Generate a chat completion from messages."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens
    )

    # Convert messages to a prompt using the model's chat template
    tokenizer = llm.get_tokenizer()
    if hasattr(tokenizer, "apply_chat_template"):
        # For newer transformers versions
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    else:
        # Fallback for models without a chat template
        prompt = format_messages_manually(messages)

    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text

def format_messages_manually(messages):
    """Format messages manually if a chat template is not available."""
    formatted_prompt = ""
    for message in messages:
        role = message["role"]
        content = message["content"]
        if role == "system":
            formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
        elif role == "user":
            formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
        elif role == "assistant":
            formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
    formatted_prompt += "<|im_start|>assistant\n"
    return formatted_prompt

def batch_inference(llm, prompts, temperature=0.7, max_tokens=512):
    """Run batch inference on multiple prompts."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens
    )

    outputs = llm.generate(prompts, sampling_params)
    return [output.outputs[0].text for output in outputs]

def main():
    parser = argparse.ArgumentParser(description="Inference with fine-tuned Qwen-2.5-7B model")
    parser.add_argument("--model_path", required=True, help="Path to the fine-tuned model directory")
    parser.add_argument("--mode", choices=["single", "chat", "batch"], default="single", help="Inference mode")
    parser.add_argument("--prompt", help="Prompt for single inference mode")
    parser.add_argument("--prompt_file", help="File containing prompts for batch mode (one per line)")
    args = parser.parse_args()

    # Initialize the model
    llm = setup_model(args.model_path)

    if args.mode == "single":
        if not args.prompt:
            args.prompt = input("Enter your prompt: ")

        print("\nGenerating response...")
        response = generate_response(llm, args.prompt)
        print(f"\nResponse:\n{response}")

    elif args.mode == "chat":
        messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
        print("\nChat mode. Type 'exit' or 'quit' to end the conversation.\n")

        while True:
            user_input = input("\nYou: ")
            if user_input.lower() in ["exit", "quit"]:
                print("Goodbye!")
                break

            messages.append({"role": "user", "content": user_input})
            response = chat_completion(llm, messages)

            print(f"\nAssistant: {response}")
            messages.append({"role": "assistant", "content": response})

    elif args.mode == "batch":
        if not args.prompt_file:
            print("Error: --prompt_file required for batch mode")
            return

        with open(args.prompt_file, 'r') as f:
            prompts = [line.strip() for line in f if line.strip()]

        print(f"Running batch inference on {len(prompts)} prompts...")
        responses = batch_inference(llm, prompts)

        with open("./test.json", "w") as final:
            json.dump(responses, final)

if __name__ == "__main__":
    main()
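As a quick sanity check of the ChatML fallback, a sketch (assuming the module is importable as inference_1) of what format_messages_manually produces:

from inference_1 import format_messages_manually

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Hello!"},
]
print(format_messages_manually(messages))
# <|im_start|>system
# You are a helpful AI assistant.<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant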
test_sft_nothink_10.json
ADDED
@@ -0,0 +1,12 @@
[
  "<|im_start|>system\nYou are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user gives an instruction that describes a task and ask you write an answer that appropriately completes the request. After thinking, when you finally reach a conclusion, clearly state the answer within <answer> </answer> tags.\n<|im_end|>\n<|im_start|>user\nInstruction: Evaluate the titles of Product 1 and Product 2 to assess their similarity and whether they are likely to be purchased or viewed together. Then, select the appropriate option.\n\nInput:\nProduct 1: Cerwin-Vega XED52 Speaker 275 W PMPO 2-Way, 2 Count, Black\nProduct 2: Rockford R169X2 6 x 9 Inches Full Range Coaxial Speaker, Set of 2\n\nOptions:\nA: Users who buy product 1 may also buy product 2.\nB: Users who view product 1 may also view product 2.\nC: The product 1 is similar with the product 2.\n<|im_end|>\n<|im_start|>assistant\n<think>Okay, I think I have finished thinking.</think>",
  "<|im_start|>system\nYou are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user gives an instruction that describes a task and ask you write an answer that appropriately completes the request. After thinking, when you finally reach a conclusion, clearly state the answer within <answer> </answer> tags.\n<|im_end|>\n<|im_start|>user\nInstruction: Given the title of two products, predict if the two products are similar, if the two products will be purchased or viewed together. Answer only from the options.\n\nInput:\nProduct 1: Kenable Internal Memory Card Reader for 5.25 CD/DVD Bay With USB Port BLACK\nProduct 2: CORSAIR Carbide 100R Mid-Tower Case\n\nOptions:\nA: Users who buy product 1 may also buy product 2.\nB: Users who view product 1 may also view product 2.\nC: The product 1 is similar with the product 2.\n<|im_end|>\n<|im_start|>assistant\n<think>Okay, I think I have finished thinking.</think>",
  "<|im_start|>system\nYou are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user gives an instruction that describes a task and ask you write an answer that appropriately completes the request. After thinking, when you finally reach a conclusion, clearly state the answer within <answer> </answer> tags.\n<|im_end|>\n<|im_start|>user\nInstruction: Analyze the titles of Product 1 and Product 2 to determine if they are similar, if they will be purchased or viewed together, and choose the corresponding option.\n\nInput:\nProduct 1: Master Half Dozen Red Pool Cue Chalk\nProduct 2: Premium Pool Table Billiard Cue Chalk 12 Pieces Choose Blue, Green, Black, Purple, Pink, Hot Pink, or Mustard\n\nOptions:\nA: Users who buy product 1 may also buy product 2.\nB: Users who view product 1 may also view product 2.\nC: The product 1 is similar with the product 2.\n<|im_end|>\n<|im_start|>assistant\n<think>Okay, I think I have finished thinking.</think>",
  "<|im_start|>system\nYou are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user gives an instruction that describes a task and ask you write an answer that appropriately completes the request. After thinking, when you finally reach a conclusion, clearly state the answer within <answer> </answer> tags.\n<|im_end|>\n<|im_start|>user\nInstruction: Evaluate the titles of Product 1 and Product 2 to assess their similarity and whether they are likely to be purchased or viewed together. Then, select the appropriate option.\n\nInput:\nProduct 1: Mossy Oak Full Spandex Face Mask\nProduct 2: Scent Control Spray - Remington Hunting Odor Elimination Spray - 24 oz\n\nOptions:\nA: Users who buy product 1 may also buy product 2.\nB: Users who view product 1 may also view product 2.\nC: The product 1 is similar with the product 2.\n<|im_end|>\n<|im_start|>assistant\n<think>Okay, I think I have finished thinking.</think>",
  "<|im_start|>system\nYou are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user gives an instruction that describes a task and ask you write an answer that appropriately completes the request. After thinking, when you finally reach a conclusion, clearly state the answer within <answer> </answer> tags.\n<|im_end|>\n<|im_start|>user\nInstruction: Predict whether two products are similar, whether two products are likely to be purchased or viewed together based on their titles. Choose your answer from the provided options.\n\nInput:\nProduct 1: Monoprice 11952 Polyurethane Replacement Ear Pads for PID 8323 type Headphones - Red\nProduct 2: Monoprice Hi-Fi Light Weight Over the Ear Headphones - Black with a 50mm driver and a 47in 3.5mm cable for Apple Iphone iPod Android Smartphone Samsung Galaxy Tablets MP3\n\nOptions:\nA: Users who buy product 1 may also buy product 2.\nB: Users who view product 1 may also view product 2.\nC: The product 1 is similar with the product 2.\n<|im_end|>\n<|im_start|>assistant\n<think>Okay, I think I have finished thinking.</think>",
  "<|im_start|>system\nYou are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user gives an instruction that describes a task and ask you write an answer that appropriately completes the request. After thinking, when you finally reach a conclusion, clearly state the answer within <answer> </answer> tags.\n<|im_end|>\n<|im_start|>user\nInstruction: Analyze the titles of Product 1 and Product 2 to determine if they are similar, if they will be purchased or viewed together, and choose the corresponding option.\n\nInput:\nProduct 1: Coleman Twin High Performance LED Lantern\nProduct 2: Coleman Twin LED Lantern\n\nOptions:\nA: Users who buy product 1 may also buy product 2.\nB: Users who view product 1 may also view product 2.\nC: The product 1 is similar with the product 2.\n<|im_end|>\n<|im_start|>assistant\n<think>Okay, I think I have finished thinking.</think>",
  "<|im_start|>system\nYou are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user gives an instruction that describes a task and ask you write an answer that appropriately completes the request. After thinking, when you finally reach a conclusion, clearly state the answer within <answer> </answer> tags.\n<|im_end|>\n<|im_start|>user\nInstruction: Evaluate the titles of Product 1 and Product 2 to assess their similarity and whether they are likely to be purchased or viewed together. Then, select the appropriate option.\n\nInput:\nProduct 1: TOOGOO(R) Pocket Pen Fishing Rod + 4.3:1 Spinning Reel Tackle Set\nProduct 2: Zebco Zcast 5'6\" 2Piece Medium-Light Action Rod Casting\n\nOptions:\nA: Users who buy product 1 may also buy product 2.\nB: Users who view product 1 may also view product 2.\nC: The product 1 is similar with the product 2.\n<|im_end|>\n<|im_start|>assistant\n<think>Okay, I think I have finished thinking.</think>",
  "<|im_start|>system\nYou are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user gives an instruction that describes a task and ask you write an answer that appropriately completes the request. After thinking, when you finally reach a conclusion, clearly state the answer within <answer> </answer> tags.\n<|im_end|>\n<|im_start|>user\nInstruction: Evaluate the titles of Product 1 and Product 2 to assess their similarity and whether they are likely to be purchased or viewed together. Then, select the appropriate option.\n\nInput:\nProduct 1: Hiware 12-piece Good Stainless Steel Teaspoon, 6.7 Inches\nProduct 2: Winco 0001-06 12-Piece Dominion Salad Fork Set, 18-0 Stainless Steel\n\nOptions:\nA: Users who buy product 1 may also buy product 2.\nB: Users who view product 1 may also view product 2.\nC: The product 1 is similar with the product 2.\n<|im_end|>\n<|im_start|>assistant\n<think>Okay, I think I have finished thinking.</think>",
  "<|im_start|>system\nYou are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user gives an instruction that describes a task and ask you write an answer that appropriately completes the request. After thinking, when you finally reach a conclusion, clearly state the answer within <answer> </answer> tags.\n<|im_end|>\n<|im_start|>user\nInstruction: Evaluate the titles of Product 1 and Product 2 to assess their similarity and whether they are likely to be purchased or viewed together. Then, select the appropriate option.\n\nInput:\nProduct 1: 55mm Wide Angle Lens for Nikon D3400 with 18-55MM AF-P DX , D5600 with 18-55MM AF-P DX, DL24-500, DL 24-500MM Digital Camera\nProduct 2: Powerextra EN-EL14 EN-EL14a 2 x Battery & Car Charger Compatible with Nikon D3100 D3200 D3300 D3400 D3500 D5100 D5200 D5300 D5500 D5600 P7000 P7100 P7200 P7700 P7800 DSLR Cameras\n\nOptions:\nA: Users who buy product 1 may also buy product 2.\nB: Users who view product 1 may also view product 2.\nC: The product 1 is similar with the product 2.\n<|im_end|>\n<|im_start|>assistant\n<think>Okay, I think I have finished thinking.</think>",
  "<|im_start|>system\nYou are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user gives an instruction that describes a task and ask you write an answer that appropriately completes the request. After thinking, when you finally reach a conclusion, clearly state the answer within <answer> </answer> tags.\n<|im_end|>\n<|im_start|>user\nInstruction: Given the title of two products, predict if the two products are similar, if the two products will be purchased or viewed together. Answer only from the options.\n\nInput:\nProduct 1: Sticky Holsters MD-2 Medium\nProduct 2: TRUGLO TFX PRO Tritium & Fiber-Optic Xtreme Handgun Sights, Ruger LC Set (TG13RS2PC)\n\nOptions:\nA: Users who buy product 1 may also buy product 2.\nB: Users who view product 1 may also view product 2.\nC: The product 1 is similar with the product 2.\n<|im_end|>\n<|im_start|>assistant\n<think>Okay, I think I have finished thinking.</think>"
]
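These prompts instruct the model to wrap its final choice in <answer> </answer> tags (with the <think> span pre-closed, hence "nothink"), so downstream scoring needs a small extraction step. A sketch of one, not part of this commit:

import re

def extract_answer(generation):
    """Pull the text inside the first <answer> ... </answer> span, if present."""
    match = re.search(r"<answer>(.*?)</answer>", generation, re.DOTALL)
    return match.group(1).strip() if match else None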