Evaluation

#1
by tugstugi - opened

What were the generation parameters for the model (temperature, top_p, etc.)? I can't reproduce the results on AIME24.

PowerInfer org

I use the same settings as in the generation_config. I will add my eval results to this repo today.

You can refer to results/amc23_eval.json. I have also uploaded aime_eval.json.
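
For reference, generate() picks up its decoding settings (temperature, top_p, etc.) directly from the repo's generation_config.json. A quick way to inspect them with transformers (the repo id below is a placeholder; substitute this model's id):

from transformers import GenerationConfig

# Placeholder repo id; use the actual model repo.
gen_cfg = GenerationConfig.from_pretrained("org/model-name")
print(gen_cfg)  # shows the settings stored in generation_config.json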

Thanks for the info. Could you provide the evaluation script? I still can't reproduce the results with vLLM using the given generation_config.json.
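
For context, a minimal vLLM setup along these lines should pick up the same settings from generation_config.json (an untested sketch, not the exact script; the model path and question below are placeholders):

from transformers import AutoTokenizer, GenerationConfig
from vllm import LLM, SamplingParams

model_path = "path/to/model"  # placeholder
question = "..."  # an AIME24 problem statement

tokenizer = AutoTokenizer.from_pretrained(model_path)
gen_cfg = GenerationConfig.from_pretrained(model_path)

llm = LLM(model=model_path)
sampling = SamplingParams(
    temperature=gen_cfg.temperature,
    top_p=gen_cfg.top_p,
    max_tokens=16384,
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": question},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
output = llm.generate([prompt], sampling)[0].outputs[0].text
print(output)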

PowerInfer org

Yes, I will provide it later. I run the evaluation with transformers:

import json
import re
import time

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

def extract_final_answer(text):
    """Extract the final numeric answer from a model response."""
    patterns = [
        r"\*+Final\s+Answer\*+\s*\n*\s*\\\[\s*\\boxed\s*{\s*([0-9.-]+)\s*}\s*\\\]",
        r"\*+Final\s+Answer\*+\s*\n*\s*\\\[\s*([0-9.-]+)\s*\\\]",
        r"\*?Final\s+Answer\*?\s*[:=]\s*([0-9.-]+)",
        r"[Tt]he\s+[Ff]inal\s+[Aa]nswer\s+[Ii]s\s*[:=]?\s*([0-9.-]+)",
        r"[Ff]inal\s+[Aa]nswer\s*[:=]\s*([0-9.-]+)",
    ]

    # Try the patterns from most to least specific and return the first match.
    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL)
        if match:
            return match.group(1).strip()

    print("No final answer found in response")
    return None

def evaluate_answer(predicted, actual):
    """Compare predicted and ground-truth answers, numerically when possible."""
    if predicted is None:
        return False
    try:
        return abs(float(predicted) - float(actual)) < 1e-5
    except (TypeError, ValueError):
        return str(predicted).strip() == str(actual).strip()

def run_inference(model, tokenizer, question, max_new_tokens=16384):
    prompt = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question},
    ]
    
    input_text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    
    # No sampling arguments are passed here, so generate() uses the settings
    # stored in the model's generation_config.json (temperature, top_p, etc.).
    generated_ids = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        use_cache=True
    )
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

def main():
    # 1. Load model and tokenizer
    model_path = "saves/qwen2-01/full/sft/checkpoint-44000"  # Replace with your model_path
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="cuda"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # 2. Load the dataset
    dataset = load_dataset("json", data_files="/home/syx/Qwen2.5-Math/evaluation/data/aime24/test.jsonl")  # Replace with the custom dataset
    eval_dataset = dataset["train"]
    
    # 3. Inference and validation
    results = {
        "correct": 0,
        "total": 0,
        "predictions": []
    }
    
    for item in tqdm(eval_dataset):
        question = item["question"]
        ground_truth = item["answer"] if "answer" in item else None
        
        # Inference
        start_time = time.time()
        response = run_inference(model, tokenizer, question)
        inference_time = time.time() - start_time
        
        # Extract
        predicted_answer = extract_final_answer(response)
        #print(response)
        #print(predicted_answer)
        
        # Evaluate
        is_correct = None
        if ground_truth is not None:
            is_correct = evaluate_answer(predicted_answer, ground_truth)
            print(is_correct)
            results["correct"] += int(is_correct)
            results["total"] += 1
            
        # Save
        results["predictions"].append({
            "question": question,
            "response": response,
            "extracted_answer": predicted_answer,
            "ground_truth": ground_truth,
            "is_correct": is_correct,
            "inference_time": inference_time
        })
        
    # 4. Output 
    if results["total"] > 0:
        accuracy = results["correct"] / results["total"] * 100
        print(f"\nAccuracy: {accuracy:.2f}%")
        print(f"Correct: {results['correct']}/{results['total']}")
    
    # Save the full results to disk
    with open("aime24.jsonl", "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

if __name__ == "__main__":
    main()

You can refer to this code. From running it several times in my testing, the number of correct instances tends to fluctuate around 5. I believe this fluctuation reflects a current limitation of the model: high variability in the generated answers. We plan to introduce RL algorithms in the future to address this.
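
Since AIME24 has only 30 problems, single-run accuracy is noisy, so it helps to average over several runs. A minimal sketch, assuming one results file per run in the format the script above writes (the file names below are placeholders):

import json

run_files = ["aime24_run1.jsonl", "aime24_run2.jsonl", "aime24_run3.jsonl"]  # placeholders

accuracies = []
for path in run_files:
    with open(path) as f:
        results = json.load(f)
    accuracies.append(results["correct"] / results["total"])

mean_acc = sum(accuracies) / len(accuracies)
print(f"Mean accuracy over {len(run_files)} runs: {mean_acc * 100:.2f}%")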

tugstugi changed discussion status to closed
