Evaluation

#1
by tugstugi - opened

What were the generation parameters for the model (temperature, top_p, etc.)? I can't reproduce the results on AIME24.

PowerInfer org

I use the same settings as in the generation_config. I will add my eval results to this repo today.

You can refer to results/amc23_eval.json. I have also uploaded aime_eval.json.
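
For reference, generate() picks up its decoding settings (temperature, top_p, etc.) directly from the repo's generation_config.json. A quick way to inspect them with transformers (the repo id below is a placeholder; substitute this model's id):

from transformers import GenerationConfig

# Placeholder repo id; use the actual model repo.
gen_cfg = GenerationConfig.from_pretrained("org/model-name")
print(gen_cfg)  # shows the settings stored in generation_config.json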

Thanks for the info. Could you provide the evaluation script? I still can't reproduce the results with vLLM using the given generation_config.json.
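
For context, a minimal vLLM setup along these lines should pick up the same settings from generation_config.json (an untested sketch, not the exact script; the model path and question below are placeholders):

from transformers import AutoTokenizer, GenerationConfig
from vllm import LLM, SamplingParams

model_path = "path/to/model"  # placeholder
question = "..."  # an AIME24 problem statement

tokenizer = AutoTokenizer.from_pretrained(model_path)
gen_cfg = GenerationConfig.from_pretrained(model_path)

llm = LLM(model=model_path)
sampling = SamplingParams(
    temperature=gen_cfg.temperature,
    top_p=gen_cfg.top_p,
    max_tokens=16384,
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": question},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
output = llm.generate([prompt], sampling)[0].outputs[0].text
print(output)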

PowerInfer org

Yes, I will provide it later. I run the evaluation with transformers:

import json
import re
import time

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

def extract_final_answer(text):
    """Extract the final numeric answer from a model response."""
    patterns = [
        r"\*+Final\s+Answer\*+\s*\n*\s*\\\[\s*\\boxed\s*{\s*([0-9.-]+)\s*}\s*\\\]",
        r"\*+Final\s+Answer\*+\s*\n*\s*\\\[\s*([0-9.-]+)\s*\\\]",
        r"\*?Final\s+Answer\*?\s*[:=]\s*([0-9.-]+)",
        r"[Tt]he\s+[Ff]inal\s+[Aa]nswer\s+[Ii]s\s*[:=]?\s*([0-9.-]+)",
        r"[Ff]inal\s+[Aa]nswer\s*[:=]\s*([0-9.-]+)",
    ]

    # Try the patterns from most to least specific and return the first match.
    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL)
        if match:
            return match.group(1).strip()

    print("No final answer found in response")
    return None

def evaluate_answer(predicted, actual):
    """Compare predicted and ground-truth answers, numerically when possible."""
    if predicted is None:
        return False
    try:
        return abs(float(predicted) - float(actual)) < 1e-5
    except (TypeError, ValueError):
        return str(predicted).strip() == str(actual).strip()

def run_inference(model, tokenizer, question, max_new_tokens=16384):
    prompt = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question},
    ]
    
    input_text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    
    # No sampling arguments are passed here, so generate() uses the settings
    # stored in the model's generation_config.json (temperature, top_p, etc.).
    generated_ids = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        use_cache=True
    )
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

def main():
    # 1. Load model and tokenizer
    model_path = "saves/qwen2-01/full/sft/checkpoint-44000"  # Replace with your model_path
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="cuda"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # 2. Load the dataset
    dataset = load_dataset("json", data_files="/home/syx/Qwen2.5-Math/evaluation/data/aime24/test.jsonl")  # Replace with the custom dataset
    eval_dataset = dataset["train"]
    
    # 3. Inference and validation
    results = {
        "correct": 0,
        "total": 0,
        "predictions": []
    }
    
    for item in tqdm(eval_dataset):
        question = item["question"]
        ground_truth = item["answer"] if "answer" in item else None
        
        # Inference
        start_time = time.time()
        response = run_inference(model, tokenizer, question)
        inference_time = time.time() - start_time
        
        # Extract
        predicted_answer = extract_final_answer(response)
        #print(response)
        #print(predicted_answer)
        
        # Evaluate
        is_correct = None
        if ground_truth is not None:
            is_correct = evaluate_answer(predicted_answer, ground_truth)
            print(is_correct)
            results["correct"] += int(is_correct)
            results["total"] += 1
            
        # Save
        results["predictions"].append({
            "question": question,
            "response": response,
            "extracted_answer": predicted_answer,
            "ground_truth": ground_truth,
            "is_correct": is_correct,
            "inference_time": inference_time
        })
        
    # 4. Output 
    if results["total"] > 0:
        accuracy = results["correct"] / results["total"] * 100
        print(f"\nAccuracy: {accuracy:.2f}%")
        print(f"Correct: {results['correct']}/{results['total']}")
    
    # Save the full results to disk
    with open("aime24.jsonl", "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

if __name__ == "__main__":
    main()

You can refer to this code. From running it several times in my testing, the number of correct instances tends to fluctuate around 5. I believe this fluctuation reflects a current limitation of the model: high variability in the generated answers. We plan to introduce RL algorithms in the future to address this.
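
Since AIME24 has only 30 problems, single-run accuracy is noisy, so it helps to average over several runs. A minimal sketch, assuming one results file per run in the format the script above writes (the file names below are placeholders):

import json

run_files = ["aime24_run1.jsonl", "aime24_run2.jsonl", "aime24_run3.jsonl"]  # placeholders

accuracies = []
for path in run_files:
    with open(path) as f:
        results = json.load(f)
    accuracies.append(results["correct"] / results["total"])

mean_acc = sum(accuracies) / len(accuracies)
print(f"Mean accuracy over {len(run_files)} runs: {mean_acc * 100:.2f}%")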

tugstugi changed discussion status to closed
